import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
allticks_data = pd.read_csv("all_ticks_wide.csv")
allticks_data
| timestamp | AEFES | AKBNK | AKSA | AKSEN | ALARK | ALBRK | ANACM | ARCLK | ASELS | ... | TTKOM | TUKAS | TUPRS | USAK | VAKBN | VESTL | YATAS | YKBNK | YUNSA | ZOREN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2012-09-17T06:45:00Z | 22.3978 | 5.2084 | 1.7102 | 3.87 | 1.4683 | 1.1356 | 1.0634 | 6.9909 | 2.9948 | ... | 4.2639 | 0.96 | 29.8072 | 1.0382 | 3.8620 | 1.90 | 0.4172 | 2.5438 | 2.2619 | 0.7789 |
| 1 | 2012-09-17T07:00:00Z | 22.3978 | 5.1938 | 1.7066 | 3.86 | 1.4574 | 1.1275 | 1.0634 | 6.9259 | 2.9948 | ... | 4.2521 | 0.96 | 29.7393 | 1.0382 | 3.8529 | 1.90 | 0.4229 | 2.5266 | 2.2462 | 0.7789 |
| 2 | 2012-09-17T07:15:00Z | 22.3978 | 5.2084 | 1.7102 | NaN | 1.4610 | 1.1356 | 1.0679 | 6.9909 | 2.9855 | ... | 4.2521 | 0.97 | 29.6716 | 1.0463 | 3.8436 | 1.91 | 0.4229 | 2.5266 | 2.2566 | 0.7789 |
| 3 | 2012-09-17T07:30:00Z | 22.3978 | 5.1938 | 1.7102 | 3.86 | 1.4537 | 1.1275 | 1.0679 | 6.9584 | 2.9855 | ... | 4.2521 | 0.97 | 29.7393 | 1.0382 | 3.8529 | 1.91 | 0.4286 | 2.5324 | 2.2619 | 0.7860 |
| 4 | 2012-09-17T07:45:00Z | 22.5649 | 5.2084 | 1.7102 | 3.87 | 1.4574 | 1.1356 | 1.0725 | 6.9909 | 2.9760 | ... | 4.2521 | 0.97 | 29.8072 | 1.0382 | 3.8620 | 1.90 | 0.4286 | 2.5324 | 2.2619 | 0.7789 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 50007 | 2019-07-23T14:00:00Z | 20.4800 | 7.7300 | 9.1400 | 2.47 | 3.2300 | 1.2100 | 2.8400 | 20.3000 | NaN | ... | 5.6000 | 4.34 | 131.6000 | 1.0500 | 4.8600 | 9.98 | 5.3500 | 2.7500 | 4.2500 | NaN |
| 50008 | 2019-07-23T14:15:00Z | 20.5000 | 7.7200 | 9.1400 | 2.47 | 3.2200 | 1.2100 | 2.8400 | 20.3200 | NaN | ... | 5.5700 | 4.35 | 131.5000 | 1.0500 | 4.8600 | 9.98 | 5.3400 | 2.7500 | 4.2400 | NaN |
| 50009 | 2019-07-23T14:30:00Z | 20.5000 | 7.7400 | 9.1300 | 2.46 | 3.2300 | 1.2100 | 2.8300 | 20.3400 | NaN | ... | 5.5700 | 4.36 | 131.5000 | 1.0500 | 4.8600 | 9.96 | 5.3400 | 2.7600 | 4.2400 | NaN |
| 50010 | 2019-07-23T14:45:00Z | 20.4000 | 7.7000 | 9.1400 | 2.47 | 3.2400 | 1.2100 | 2.8200 | 20.3800 | NaN | ... | 5.5700 | 4.35 | 131.3000 | 1.0400 | 4.8600 | 9.94 | 5.3400 | 2.7700 | 4.2400 | NaN |
| 50011 | 2019-07-23T15:00:00Z | 20.4600 | 7.7000 | 9.1400 | 2.47 | 3.2300 | 1.2000 | 2.8300 | 20.3200 | NaN | ... | 5.5600 | 4.34 | 131.8000 | 1.0500 | 4.8500 | 9.93 | 5.3300 | 2.7700 | 4.2400 | NaN |
50012 rows × 61 columns
df = allticks_data.drop('timestamp', axis = 1) df.mode(axis=0)
allticks_data.describe()
| AEFES | AKBNK | AKSA | AKSEN | ALARK | ALBRK | ANACM | ARCLK | ASELS | ASUZU | ... | TTKOM | TUKAS | TUPRS | USAK | VAKBN | VESTL | YATAS | YKBNK | YUNSA | ZOREN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 48131.000000 | 49209.000000 | 48594.000000 | 48171.000000 | 48335.000000 | 46862.000000 | 48165.000000 | 49045.000000 | 48803.000000 | 48433.000000 | ... | 49077.000000 | 45929.000000 | 49143.000000 | 47659.000000 | 49212.000000 | 48781.000000 | 46055.000000 | 49225.000000 | 45528.000000 | 48807.000000 |
| mean | 20.982235 | 6.473105 | 7.127504 | 3.183542 | 2.060859 | 1.365549 | 1.672102 | 15.388088 | 13.432535 | 6.467033 | ... | 5.660680 | 1.737529 | 62.994535 | 1.220452 | 4.735438 | 5.942711 | 2.434249 | 2.566327 | 4.079695 | 1.248124 |
| std | 2.494002 | 0.944955 | 2.710033 | 0.724332 | 0.575943 | 0.167824 | 0.788365 | 4.531459 | 9.624246 | 2.201036 | ... | 0.818598 | 0.867095 | 32.398117 | 0.459532 | 0.977889 | 2.830465 | 2.552377 | 0.422774 | 1.347020 | 0.311330 |
| min | 0.000100 | 0.000100 | 0.000100 | 0.000000 | 0.000100 | 1.025500 | 0.000100 | 0.000100 | 0.000100 | 0.000100 | ... | 0.000100 | 0.650000 | 0.000100 | 0.000100 | 0.000100 | 0.000000 | 0.000100 | 0.000100 | 0.000100 | 0.000100 |
| 25% | 19.160500 | 5.850000 | 5.208800 | 2.670000 | 1.568900 | 1.225100 | 1.047000 | 11.711100 | 4.940300 | 5.074800 | ... | 5.267300 | 1.060000 | 34.549100 | 0.957100 | 4.032200 | 4.020000 | 0.388600 | 2.268200 | 3.006700 | 1.033800 |
| 50% | 20.646500 | 6.305700 | 6.985300 | 2.930000 | 1.937600 | 1.360200 | 1.259700 | 15.010000 | 9.275700 | 5.949600 | ... | 5.746400 | 1.530000 | 49.554200 | 1.050000 | 4.474200 | 6.320000 | 0.965800 | 2.609300 | 4.107800 | 1.250000 |
| 75% | 22.732000 | 6.932500 | 8.720000 | 3.750000 | 2.421400 | 1.500000 | 2.402100 | 19.087700 | 22.756700 | 7.120000 | ... | 6.260000 | 2.130000 | 93.428700 | 1.370800 | 5.246000 | 7.450000 | 4.230000 | 2.874000 | 4.720600 | 1.426500 |
| max | 28.509000 | 9.212400 | 15.118900 | 5.190000 | 3.514300 | 2.190000 | 3.502100 | 26.427800 | 46.761600 | 15.280000 | ... | 7.350000 | 5.920000 | 139.293700 | 2.757800 | 7.581400 | 14.540000 | 10.674800 | 3.958100 | 9.527500 | 2.443000 |
8 rows × 60 columns
# Select all columns except the first column (timestamps)
columns_to_analyze = allticks_data.columns[1:]
# Calculate the variance, standard deviation, and coefficient of variation for each selected column
variances = allticks_data[columns_to_analyze].var()
std_deviations = allticks_data[columns_to_analyze].std()
means = allticks_data[columns_to_analyze].mean()
coeff_of_variation = (std_deviations / means) * 100 # Multiply by 100 to express as a percentage
# Create a new DataFrame to store the results
statistics_df = pd.DataFrame({
'Column': columns_to_analyze,
'Variance': variances,
'Standard Deviation': std_deviations,
'Coefficient of Variation': coeff_of_variation
})
# Display the calculated statistics
print(statistics_df)
# Save the statistics to an Excel file
statistics_df.to_excel('stats.xlsx', index=False)
Column Variance Standard Deviation Coefficient of Variation AEFES AEFES 6.220045 2.494002 11.886255 AKBNK AKBNK 0.892940 0.944955 14.598175 AKSA AKSA 7.344281 2.710033 38.022191 AKSEN AKSEN 0.524657 0.724332 22.752397 ALARK ALARK 0.331711 0.575943 27.946760 ALBRK ALBRK 0.028165 0.167824 12.289822 ANACM ANACM 0.621519 0.788365 47.148149 ARCLK ARCLK 20.534121 4.531459 29.447837 ASELS ASELS 92.626107 9.624246 71.648765 ASUZU ASUZU 4.844559 2.201036 34.034707 AYGAZ AYGAZ 6.814199 2.610402 32.219438 BAGFS BAGFS 13.090345 3.618058 34.765198 BANVT BANVT 39.278773 6.267278 82.159003 BRISA BRISA 1.677856 1.295321 19.791314 CCOLA CCOLA 45.524879 6.747213 18.289735 CEMAS CEMAS 0.639970 0.799981 66.164017 ECILC ECILC 0.948264 0.973788 46.910015 EREGL EREGL 7.240034 2.690731 64.378583 FROTO FROTO 217.051399 14.732664 44.966434 GARAN GARAN 1.561593 1.249637 15.818726 GOODY GOODY 0.785805 0.886456 28.572461 GUBRF GUBRF 1.495699 1.222988 28.255460 HALKB HALKB 9.434500 3.071563 28.129534 ICBCT ICBCT 3.203683 1.789883 63.280257 ISCTR ISCTR 1.006784 1.003386 19.572350 ISDMR ISDMR 2.882925 1.697918 31.726922 ISFIN ISFIN 3.114658 1.764839 113.172823 ISYAT ISYAT 0.025679 0.160246 29.822208 KAREL KAREL 4.552330 2.133619 67.136667 KARSN KARSN 0.084340 0.290413 21.886500 KCHOL KCHOL 10.121585 3.181444 25.974593 KRDMB KRDMB 0.471124 0.686385 30.879316 KRDMD KRDMD 0.883774 0.940092 53.160903 MGROS MGROS 15.219903 3.901269 19.928407 OTKAR OTKAR 771.885345 27.782825 34.123048 PARSN PARSN 21.738631 4.662471 56.330518 PETKM PETKM 1.900291 1.378510 54.288370 PGSUS PGSUS 58.622534 7.656535 30.886220 PRKME PRKME 0.521210 0.721949 24.664232 SAHOL SAHOL 0.912617 0.955310 11.087758 SASA SASA 6.214720 2.492934 108.630461 SISE SISE 2.024498 1.422848 46.675752 SKBNK SKBNK 0.086971 0.294908 20.012042 SODA SODA 4.185150 2.045764 64.138753 TCELL TCELL 5.551913 2.356250 23.974859 THYAO THYAO 16.219089 4.027293 43.356341 TKFEN TKFEN 44.449512 6.667047 72.532480 TOASO TOASO 40.046632 6.328241 38.128192 TRKCM TRKCM 1.209267 1.099667 54.228227 TSKB TSKB 0.024111 0.155276 16.427277 TTKOM TTKOM 0.670103 0.818598 14.461129 TUKAS TUKAS 0.751854 0.867095 49.903932 TUPRS TUPRS 1049.637996 32.398117 51.430044 USAK USAK 0.211170 0.459532 37.652640 VAKBN VAKBN 0.956266 0.977889 20.650439 VESTL VESTL 8.011531 2.830465 47.629185 YATAS YATAS 6.514629 2.552377 104.852755 YKBNK YKBNK 0.178737 0.422774 16.473878 YUNSA YUNSA 1.814463 1.347020 33.017664 ZOREN ZOREN 0.096926 0.311330 24.943838
# Assuming your DataFrame is named "allticks_data"
# Select all the columns containing stock prices (excluding the "timestamps" column)
stock_columns = allticks_data.columns[1:] # Exclude the first column (timestamps)
# Create a box plot for each stock price column
for company in stock_columns:
sns.set(style="whitegrid") # Set the style of the plot (customize as needed)
sns.boxplot(x=allticks_data[company])
plt.xlabel("Price")
plt.title(f"Box Plot of {company} Stock Price")
plt.show()
# Select all the columns containing stock prices (excluding the "timestamps" column)
stock_columns = allticks_data.columns[1:] # Exclude the first column (timestamps)
# Create a single combined box plot for all companies' stock prices
sns.set(style="whitegrid") # Set the style of the plot (customize as needed)
sns.boxplot(data=allticks_data[stock_columns])
plt.xticks(rotation=90) # Rotate x-axis labels for readability
plt.xlabel("Company")
plt.ylabel("Price")
plt.title("Combined Box Plot of Stock Prices for All Companies")
plt.show()
ALBRK, ASUZU, CEMAS, ICBCT, ISFIN, PARSN, TUKAS and some others have many outliers on the maximum side.
Outliers that are bigger than the maximum value of the whisker: The stock price increased or decreased significantly, basically it had smaller values most of the time but there are a few data that are significantly higher. There are many little values so that Q1 and Q3, thus, min and max whisker is calculated based on those values. Max value is calculated as a small value and all the bigger numbers are considered as an outlier.
ISDMR, TTKOM have many outliers on the minimum side. The opposite case of the maximum is valid in this case. However, those who have maximum side outliers are much more since the stock market is gaining popularity with time. The earlier values of all companies are small and similar in the beginning but they differ after some time.
# Select the columns containing stock prices (excluding the timestamp column)
stock_prices = allticks_data.iloc[:, 1:]
# Calculate the correlation matrix
correlation_matrix = stock_prices.corr()
# You now have a 60x60 correlation matrix
print(correlation_matrix)
correlation_matrix.to_excel('correlation_matrix.xlsx',index=False)
AEFES AKBNK AKSA AKSEN ALARK ALBRK ANACM \
AEFES 1.000000 0.267502 -0.000625 0.528379 0.319894 0.496981 0.117802
AKBNK 0.267502 1.000000 0.572321 0.548420 0.657648 0.165933 0.439169
AKSA -0.000625 0.572321 1.000000 0.280772 0.636207 -0.045111 0.712541
AKSEN 0.528379 0.548420 0.280772 1.000000 0.470681 0.283348 0.469720
ALARK 0.319894 0.657648 0.636207 0.470681 1.000000 0.158375 0.736034
ALBRK 0.496981 0.165933 -0.045111 0.283348 0.158375 1.000000 0.082215
ANACM 0.117802 0.439169 0.712541 0.469720 0.736034 0.082215 1.000000
ARCLK -0.335744 0.644439 0.735731 0.002287 0.464037 -0.243290 0.444435
ASELS 0.032488 0.583853 0.844647 0.439348 0.752573 -0.052004 0.893538
ASUZU 0.442354 0.587298 0.760313 0.594102 0.719034 0.318581 0.753061
AYGAZ -0.039336 0.680857 0.899364 0.342756 0.704378 -0.087295 0.775918
BAGFS 0.002867 0.030708 -0.273922 0.025920 -0.415153 -0.046602 -0.535003
BANVT 0.161839 0.658539 0.776519 0.499313 0.865129 -0.046694 0.832482
BRISA -0.227536 0.298082 0.478165 -0.141430 -0.058068 0.080278 0.051265
CCOLA 0.479695 -0.083399 -0.418557 -0.079760 -0.129134 0.485198 -0.557608
CEMAS 0.220689 0.480677 0.578001 0.670041 0.528057 0.020543 0.653769
ECILC -0.028022 0.687260 0.893949 0.420829 0.728123 -0.124651 0.788193
EREGL 0.022334 0.427050 0.830970 0.336232 0.657187 -0.010935 0.931368
FROTO 0.014181 0.417780 0.817036 0.308924 0.654174 0.014792 0.927516
GARAN 0.330771 0.936752 0.630530 0.524864 0.725833 0.265882 0.576837
GOODY -0.148720 0.536148 0.567748 0.153148 0.282585 -0.034644 0.346986
GUBRF -0.223603 0.133821 0.158594 -0.196855 -0.397135 -0.085523 -0.440147
HALKB 0.434685 0.117774 -0.532093 0.227179 -0.151374 0.375976 -0.593122
ICBCT 0.090316 0.331142 0.652518 0.312171 0.531813 0.020797 0.779935
ISCTR 0.304757 0.909713 0.642620 0.519119 0.753016 0.212024 0.519361
ISDMR 0.297087 -0.299530 0.347700 0.243702 0.307496 0.294577 0.860045
ISFIN -0.127835 0.069228 0.389008 -0.072826 0.407486 0.166525 0.732345
ISYAT -0.189416 0.285470 0.702517 -0.080503 0.587087 -0.065670 0.788362
KAREL 0.181108 0.561646 0.790006 0.474214 0.813202 0.138252 0.934658
KARSN 0.279880 0.495951 0.669682 0.426398 0.388063 0.367376 0.504093
KCHOL -0.143505 0.623556 0.838016 0.207000 0.658166 -0.050933 0.799675
KRDMB 0.549010 0.065148 0.276477 0.264686 0.451875 0.477526 0.252403
KRDMD 0.292833 0.331925 0.685565 0.484434 0.518801 0.208807 0.799184
MGROS 0.412668 0.766520 0.291698 0.600075 0.403851 0.201810 0.037760
OTKAR -0.368310 0.577816 0.737121 -0.026571 0.453374 -0.213952 0.476567
PARSN -0.075655 0.342925 0.736904 0.303896 0.467247 -0.094028 0.820097
PETKM -0.016443 0.620385 0.865281 0.374754 0.727185 -0.057930 0.884786
PGSUS 0.518629 0.102545 0.155788 0.216294 0.548264 0.476764 0.325378
PRKME 0.624415 0.432895 0.006464 0.696566 0.499550 0.329030 0.162587
SAHOL 0.421908 0.874744 0.365626 0.473875 0.492877 0.279617 0.139630
SASA 0.064751 0.345486 0.720528 0.375461 0.650567 0.050585 0.963165
SISE -0.139481 0.392096 0.765385 0.177282 0.614721 -0.001287 0.911225
SKBNK 0.529226 -0.026489 -0.339876 0.076806 -0.189774 0.535323 -0.559730
SODA -0.255682 0.249245 0.699090 0.068726 0.502520 -0.115282 0.859063
TCELL 0.086501 0.452946 0.818100 0.089541 0.622756 0.175730 0.727762
THYAO 0.243128 0.258007 0.673206 0.395772 0.550375 0.279413 0.851014
TKFEN -0.018680 0.260238 0.555490 0.212043 0.633635 0.052638 0.927872
TOASO -0.165691 0.673631 0.895798 0.256923 0.586738 -0.221665 0.622388
TRKCM 0.091527 0.505256 0.852854 0.411484 0.732022 0.065874 0.927069
TSKB 0.333290 0.856766 0.544898 0.590779 0.427739 0.182478 0.209076
TTKOM 0.255756 0.578765 0.131398 0.211780 0.080230 0.235083 -0.306898
TUKAS -0.204048 0.315863 0.561769 -0.077261 0.498251 0.024473 0.589974
TUPRS -0.100480 0.384826 0.732025 0.235223 0.655195 -0.018337 0.928284
USAK 0.083471 0.785438 0.728294 0.581016 0.652558 -0.075049 0.552056
VAKBN 0.397543 0.942930 0.512615 0.606355 0.653074 0.304312 0.346859
VESTL -0.150159 0.484747 0.776674 0.136128 0.509264 -0.016297 0.741402
YATAS 0.235401 0.593489 0.821989 0.456702 0.791923 0.081669 0.885328
YKBNK 0.538889 0.516761 -0.169104 0.375938 0.161212 0.434805 -0.314139
YUNSA 0.475578 0.083837 0.394182 0.451502 0.374540 0.460184 0.565684
ZOREN 0.061791 0.627474 0.701427 0.371093 0.391374 0.193623 0.515836
ARCLK ASELS ASUZU ... TTKOM TUKAS TUPRS \
AEFES -0.335744 0.032488 0.442354 ... 0.255756 -0.204048 -0.100480
AKBNK 0.644439 0.583853 0.587298 ... 0.578765 0.315863 0.384826
AKSA 0.735731 0.844647 0.760313 ... 0.131398 0.561769 0.732025
AKSEN 0.002287 0.439348 0.594102 ... 0.211780 -0.077261 0.235223
ALARK 0.464037 0.752573 0.719034 ... 0.080230 0.498251 0.655195
ALBRK -0.243290 -0.052004 0.318581 ... 0.235083 0.024473 -0.018337
ANACM 0.444435 0.893538 0.753061 ... -0.306898 0.589974 0.928284
ARCLK 1.000000 0.645008 0.332156 ... 0.238665 0.528223 0.570077
ASELS 0.645008 1.000000 0.779175 ... -0.125976 0.528841 0.893495
ASUZU 0.332156 0.779175 1.000000 ... 0.114170 0.377230 0.636567
AYGAZ 0.832983 0.929203 0.692950 ... 0.051660 0.507003 0.813354
BAGFS -0.145831 -0.403790 -0.308348 ... 0.453879 -0.262078 -0.512008
BANVT 0.584277 0.936341 0.781870 ... -0.028089 0.490990 0.808954
BRISA 0.490160 0.251358 0.289053 ... 0.378291 0.341308 0.204787
CCOLA -0.403100 -0.564035 -0.198823 ... 0.404175 -0.462148 -0.635989
CEMAS 0.305539 0.689888 0.656293 ... -0.037923 0.071991 0.494005
ECILC 0.800039 0.939423 0.707779 ... 0.054373 0.480125 0.795279
EREGL 0.562972 0.925961 0.774213 ... -0.253603 0.588243 0.929331
FROTO 0.549083 0.909404 0.758415 ... -0.251237 0.640170 0.943490
GARAN 0.585013 0.670082 0.721524 ... 0.446646 0.464070 0.535912
GOODY 0.676047 0.559465 0.382466 ... 0.189484 0.267082 0.439865
GUBRF 0.270226 -0.178222 -0.120396 ... 0.610556 -0.025786 -0.291051
HALKB -0.425135 -0.559875 -0.276755 ... 0.533515 -0.528944 -0.694929
ICBCT 0.394651 0.836779 0.666557 ... -0.294283 0.375011 0.808694
ISCTR 0.600099 0.608127 0.665832 ... 0.514890 0.435206 0.463898
ISDMR -0.567658 0.489981 0.530153 ... -0.450385 0.251758 0.735507
ISFIN 0.322790 0.547116 0.367441 ... -0.463556 0.690668 0.802626
ISYAT 0.609967 0.719683 0.525800 ... -0.297996 0.796818 0.875561
KAREL 0.508408 0.894148 0.818012 ... -0.109278 0.660567 0.882167
KARSN 0.267356 0.565763 0.815915 ... 0.289611 0.385659 0.455140
KCHOL 0.830299 0.878205 0.636624 ... -0.012342 0.722824 0.886318
KRDMB -0.175953 0.201025 0.535803 ... 0.055775 -0.052234 0.105183
KRDMD 0.250640 0.763269 0.837706 ... -0.187339 0.319795 0.717513
MGROS 0.282331 0.254276 0.391553 ... 0.739648 -0.084991 -0.055947
OTKAR 0.917773 0.636471 0.376711 ... 0.195679 0.584642 0.586448
PARSN 0.514000 0.819982 0.628660 ... -0.315603 0.438614 0.820912
PETKM 0.732698 0.963461 0.745418 ... -0.087614 0.596628 0.891300
PGSUS -0.309820 0.170161 0.478515 ... -0.027542 0.276271 0.211858
PRKME -0.208820 0.181213 0.355657 ... 0.323079 -0.120132 -0.018191
SAHOL 0.461832 0.278019 0.425178 ... 0.762092 0.096913 0.059683
SASA 0.426195 0.877541 0.734678 ... -0.368041 0.588859 0.935314
SISE 0.648147 0.879780 0.631260 ... -0.305630 0.702447 0.962189
SKBNK -0.485077 -0.482530 -0.032653 ... 0.498479 -0.440434 -0.640367
SODA 0.600288 0.813426 0.511613 ... -0.422378 0.694059 0.950182
TCELL 0.588828 0.783621 0.740609 ... -0.013502 0.653890 0.780493
THYAO 0.208139 0.766054 0.818347 ... -0.276484 0.469554 0.792173
TKFEN 0.376929 0.778524 0.586514 ... -0.471644 0.687804 0.936246
TOASO 0.894959 0.838014 0.604371 ... 0.189400 0.482214 0.684919
TRKCM 0.583270 0.941957 0.825733 ... -0.163798 0.536308 0.892095
TSKB 0.500843 0.429668 0.563091 ... 0.737490 0.083406 0.131547
TTKOM 0.238665 -0.125976 0.114170 ... 1.000000 -0.047513 -0.337725
TUKAS 0.528223 0.528841 0.377230 ... -0.047513 1.000000 0.731186
TUPRS 0.570077 0.893495 0.636567 ... -0.337725 0.731186 1.000000
USAK 0.651974 0.757735 0.613202 ... 0.339795 0.316385 0.531317
VAKBN 0.524931 0.489668 0.598292 ... 0.647927 0.200089 0.255812
VESTL 0.673333 0.747866 0.660330 ... -0.039156 0.716806 0.793732
YATAS 0.526898 0.931239 0.849296 ... -0.053002 0.539603 0.838108
YKBNK -0.119385 -0.259706 0.105741 ... 0.752652 -0.242727 -0.442756
YUNSA -0.186253 0.420727 0.696278 ... -0.167863 0.126451 0.416737
ZOREN 0.545555 0.635820 0.720764 ... 0.326075 0.377858 0.500383
USAK VAKBN VESTL YATAS YKBNK YUNSA ZOREN
AEFES 0.083471 0.397543 -0.150159 0.235401 0.538889 0.475578 0.061791
AKBNK 0.785438 0.942930 0.484747 0.593489 0.516761 0.083837 0.627474
AKSA 0.728294 0.512615 0.776674 0.821989 -0.169104 0.394182 0.701427
AKSEN 0.581016 0.606355 0.136128 0.456702 0.375938 0.451502 0.371093
ALARK 0.652558 0.653074 0.509264 0.791923 0.161212 0.374540 0.391374
ALBRK -0.075049 0.304312 -0.016297 0.081669 0.434805 0.460184 0.193623
ANACM 0.552056 0.346859 0.741402 0.885328 -0.314139 0.565684 0.515836
ARCLK 0.651974 0.524931 0.673333 0.526898 -0.119385 -0.186253 0.545555
ASELS 0.757735 0.489668 0.747866 0.931239 -0.259706 0.420727 0.635820
ASUZU 0.613202 0.598292 0.660330 0.849296 0.105741 0.696278 0.720764
AYGAZ 0.809034 0.592549 0.758866 0.836522 -0.169948 0.257124 0.659939
BAGFS -0.006847 0.070047 -0.271655 -0.421186 0.405499 -0.373769 -0.007022
BANVT 0.798737 0.587479 0.669037 0.898647 -0.075599 0.371989 0.566044
BRISA 0.264396 0.240770 0.537781 0.154711 0.018329 0.094437 0.666418
CCOLA -0.342572 0.106403 -0.556101 -0.478281 0.618194 0.048334 -0.297058
CEMAS 0.631888 0.451906 0.412297 0.654709 -0.030140 0.472474 0.479668
ECILC 0.845493 0.604237 0.749101 0.853425 -0.153317 0.279614 0.662133
EREGL 0.579126 0.324807 0.843051 0.891983 -0.363152 0.582277 0.623334
FROTO 0.531192 0.301170 0.821596 0.886083 -0.369152 0.565362 0.615279
GARAN 0.735682 0.889723 0.591568 0.716645 0.416988 0.274365 0.684231
GOODY 0.553901 0.449401 0.590861 0.428813 -0.003119 0.012931 0.626049
GUBRF 0.119931 0.123224 0.021861 -0.283812 0.181517 -0.308403 0.273674
HALKB -0.085364 0.289094 -0.663312 -0.526251 0.796584 -0.182357 -0.316223
ICBCT 0.506300 0.234724 0.587006 0.765115 -0.362153 0.540416 0.503495
ISCTR 0.785017 0.915960 0.538922 0.644241 0.465772 0.248998 0.585588
ISDMR -0.172702 -0.284470 0.470635 0.521031 -0.307184 0.747014 0.105374
ISFIN 0.080190 -0.015633 0.638451 0.576695 -0.497536 0.347318 0.262856
ISYAT 0.304051 0.142741 0.806164 0.707975 -0.450457 0.337598 0.455052
KAREL 0.647139 0.502028 0.779622 0.933984 -0.146574 0.530879 0.594751
KARSN 0.488105 0.493382 0.615618 0.641713 0.134530 0.569537 0.781788
KCHOL 0.659195 0.484400 0.866697 0.794275 -0.234732 0.233001 0.669182
KRDMB 0.089792 0.224270 0.052380 0.378105 0.197284 0.706336 0.160795
KRDMD 0.445333 0.300068 0.688775 0.790928 -0.175728 0.809497 0.630392
MGROS 0.700536 0.839945 0.056260 0.236725 0.722851 0.053126 0.403012
OTKAR 0.550614 0.452672 0.720430 0.531581 -0.169528 -0.114003 0.582256
PARSN 0.494088 0.197149 0.735754 0.749385 -0.428178 0.458106 0.544246
PETKM 0.724798 0.503126 0.810474 0.915139 -0.263232 0.345646 0.680342
PGSUS 0.019848 0.170019 0.093652 0.366702 0.241310 0.590951 0.093774
PRKME 0.424865 0.546548 -0.186046 0.283737 0.565032 0.247195 0.057171
SAHOL 0.566766 0.897199 0.220474 0.320306 0.734385 0.036501 0.461139
SASA 0.501127 0.249046 0.749320 0.863806 -0.404375 0.606613 0.511793
SISE 0.499419 0.263305 0.841056 0.807107 -0.453436 0.420605 0.574375
SKBNK -0.220973 0.168601 -0.505531 -0.347009 0.653931 0.065890 -0.084631
SODA 0.415936 0.103986 0.799403 0.709057 -0.580021 0.365282 0.463037
TCELL 0.454743 0.383525 0.753048 0.797806 -0.207289 0.512561 0.648446
THYAO 0.360347 0.210835 0.643835 0.801843 -0.305493 0.777343 0.574916
TKFEN 0.320969 0.131223 0.719123 0.767302 -0.447145 0.486685 0.384886
TOASO 0.791129 0.573100 0.759496 0.744917 -0.134525 0.093819 0.686027
TRKCM 0.634915 0.433015 0.812177 0.911697 -0.267388 0.604393 0.667266
TSKB 0.731866 0.880894 0.347174 0.453043 0.574964 0.090291 0.668327
TTKOM 0.339795 0.647927 -0.039156 -0.053002 0.752652 -0.167863 0.326075
TUKAS 0.316385 0.200089 0.716806 0.539603 -0.242727 0.126451 0.377858
TUPRS 0.531317 0.255812 0.793732 0.838108 -0.442756 0.416737 0.500383
USAK 1.000000 0.755431 0.493929 0.661793 0.175051 0.143049 0.564730
VAKBN 0.755431 1.000000 0.357053 0.532766 0.634247 0.140449 0.564502
VESTL 0.493929 0.357053 1.000000 0.750194 -0.223108 0.355600 0.759132
YATAS 0.661793 0.532766 0.750194 1.000000 -0.132810 0.518503 0.645384
YKBNK 0.175051 0.634247 -0.223108 -0.132810 1.000000 -0.017644 0.110198
YUNSA 0.143049 0.140449 0.355600 0.518503 -0.017644 1.000000 0.378837
ZOREN 0.564730 0.564502 0.759132 0.645384 0.110198 0.378837 1.000000
[60 rows x 60 columns]
--------------------------------------------------------------------------- PermissionError Traceback (most recent call last) Cell In[65], line 10 7 # You now have a 60x60 correlation matrix 8 print(correlation_matrix) ---> 10 correlation_matrix.to_excel('correlation_matrix.xlsx',index=False) File ~\anaconda3\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs) 209 else: 210 kwargs[new_arg_name] = new_arg_value --> 211 return func(*args, **kwargs) File ~\anaconda3\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs) 209 else: 210 kwargs[new_arg_name] = new_arg_value --> 211 return func(*args, **kwargs) File ~\anaconda3\lib\site-packages\pandas\core\generic.py:2374, in NDFrame.to_excel(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, encoding, inf_rep, verbose, freeze_panes, storage_options) 2361 from pandas.io.formats.excel import ExcelFormatter 2363 formatter = ExcelFormatter( 2364 df, 2365 na_rep=na_rep, (...) 2372 inf_rep=inf_rep, 2373 ) -> 2374 formatter.write( 2375 excel_writer, 2376 sheet_name=sheet_name, 2377 startrow=startrow, 2378 startcol=startcol, 2379 freeze_panes=freeze_panes, 2380 engine=engine, 2381 storage_options=storage_options, 2382 ) File ~\anaconda3\lib\site-packages\pandas\io\formats\excel.py:944, in ExcelFormatter.write(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options) 940 need_save = False 941 else: 942 # error: Cannot instantiate abstract class 'ExcelWriter' with abstract 943 # attributes 'engine', 'save', 'supported_extensions' and 'write_cells' --> 944 writer = ExcelWriter( # type: ignore[abstract] 945 writer, engine=engine, storage_options=storage_options 946 ) 947 need_save = True 949 try: File ~\anaconda3\lib\site-packages\pandas\io\excel\_openpyxl.py:60, in OpenpyxlWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs) 56 from openpyxl.workbook import Workbook 58 engine_kwargs = combine_kwargs(engine_kwargs, kwargs) ---> 60 super().__init__( 61 path, 62 mode=mode, 63 storage_options=storage_options, 64 if_sheet_exists=if_sheet_exists, 65 engine_kwargs=engine_kwargs, 66 ) 68 # ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from 69 # the file and later write to it 70 if "r+" in self._mode: # Load from existing workbook File ~\anaconda3\lib\site-packages\pandas\io\excel\_base.py:1313, in ExcelWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs) 1309 self._handles = IOHandles( 1310 cast(IO[bytes], path), compression={"compression": None} 1311 ) 1312 if not isinstance(path, ExcelWriter): -> 1313 self._handles = get_handle( 1314 path, mode, storage_options=storage_options, is_text=False 1315 ) 1316 self._cur_sheet = None 1318 if date_format is None: File ~\anaconda3\lib\site-packages\pandas\io\common.py:865, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 856 handle = open( 857 handle, 858 ioargs.mode, (...) 861 newline="", 862 ) 863 else: 864 # Binary mode --> 865 handle = open(handle, ioargs.mode) 866 handles.append(handle) 868 # Convert BytesIO or file objects passed with an encoding PermissionError: [Errno 13] Permission denied: 'correlation_matrix.xlsx'
#highest correlations
# Flatten the upper triangular part of the correlation matrix, excluding the diagonal
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
# Sort the values in descending order
sorted_values = upper_triangle.unstack().sort_values(ascending=False)
# Select the top values
top_values = sorted_values[0:60]
# Get the corresponding row and column names
top_corr_pairs = top_values.index
top_corr_values = top_values.values
# Print the results
for pair, value in zip(top_corr_pairs, top_corr_values):
row, col = pair
print(f"Correlation between {row} and {col}: {value:.4f}")
Correlation between TRKCM and EREGL: 0.9740 Correlation between FROTO and EREGL: 0.9725 Correlation between ECILC and AYGAZ: 0.9711 Correlation between SODA and SISE: 0.9702 Correlation between PETKM and ASELS: 0.9635 Correlation between SASA and ANACM: 0.9632 Correlation between TUPRS and SISE: 0.9622 Correlation between SASA and FROTO: 0.9560 Correlation between SASA and EREGL: 0.9542 Correlation between TUPRS and SODA: 0.9502 Correlation between PETKM and AYGAZ: 0.9472 Correlation between TRKCM and FROTO: 0.9462 Correlation between TUPRS and FROTO: 0.9435 Correlation between VAKBN and AKBNK: 0.9429 Correlation between TRKCM and ASELS: 0.9420 Correlation between PETKM and ECILC: 0.9418 Correlation between ECILC and ASELS: 0.9394 Correlation between TOASO and AYGAZ: 0.9374 Correlation between SISE and FROTO: 0.9371 Correlation between GARAN and AKBNK: 0.9368 Correlation between BANVT and ASELS: 0.9363 Correlation between TUPRS and TKFEN: 0.9362 Correlation between TUPRS and SASA: 0.9353 Correlation between TOASO and ECILC: 0.9351 Correlation between KAREL and ANACM: 0.9347 Correlation between SISE and EREGL: 0.9343 Correlation between YATAS and KAREL: 0.9340 Correlation between ISCTR and GARAN: 0.9322 Correlation between SASA and ISDMR: 0.9321 Correlation between EREGL and ANACM: 0.9314 Correlation between TKFEN and SASA: 0.9313 Correlation between YATAS and ASELS: 0.9312 Correlation between TUPRS and EREGL: 0.9293 Correlation between TRKCM and PETKM: 0.9292 Correlation between AYGAZ and ASELS: 0.9292 Correlation between TUPRS and ANACM: 0.9283 Correlation between TKFEN and ANACM: 0.9279 Correlation between FROTO and ANACM: 0.9275 Correlation between TRKCM and ANACM: 0.9271 Correlation between EREGL and ASELS: 0.9260 Correlation between TRKCM and SASA: 0.9253 Correlation between PETKM and KCHOL: 0.9239 Correlation between KCHOL and AYGAZ: 0.9233 Correlation between TRKCM and SISE: 0.9191 Correlation between OTKAR and ARCLK: 0.9178 Correlation between SISE and KCHOL: 0.9163 Correlation between VAKBN and ISCTR: 0.9160 Correlation between YATAS and PETKM: 0.9151 Correlation between PETKM and EREGL: 0.9141 Correlation between SISE and SASA: 0.9134 Correlation between YATAS and TRKCM: 0.9117 Correlation between SISE and ANACM: 0.9112 Correlation between ISCTR and AKBNK: 0.9097 Correlation between TRKCM and KAREL: 0.9096 Correlation between TKFEN and SISE: 0.9095 Correlation between FROTO and ASELS: 0.9094 Correlation between PETKM and FROTO: 0.9089 Correlation between ISDMR and FROTO: 0.9070 Correlation between SISE and PETKM: 0.9068 Correlation between SASA and KAREL: 0.9046
#I selected these two because they had the highest coefficients of variations
# Select the specified columns
selected_columns = ["ISFIN", "SASA"]
# Initialize a list to store correlation values and corresponding timestamps
correlations_with_timestamps = []
# Calculate the correlation and create and display separate plots for each set of 750 rows
num_sets = len(allticks_data) // 750
for i in range(num_sets):
start_row = i * 750
end_row = (i + 1) * 750
period_data = allticks_data[selected_columns][start_row:end_row]
# Drop rows with missing values
period_data.dropna(inplace=True)
# Extract the data for each column
x = period_data["ISFIN"]
y = period_data["SASA"]
# Calculate the correlation coefficient and correlation
correlation_coefficient = np.corrcoef(x, y)[0, 1]
correlation = np.corrcoef(x, y)[0, 1] # Calculate the correlation itself
# Get the start and end timestamps
start_timestamp = allticks_data["timestamp"].iloc[start_row]
end_timestamp = allticks_data["timestamp"].iloc[end_row - 1]
# Create a scatter plot for the period
plt.figure(figsize=(8, 6))
plt.scatter(x, y, label="Data Points", color="blue")
# Set labels and title
plt.xlabel("ISFIN")
plt.ylabel("SASA")
plt.title(f"Scatter Plot for Rows {start_row+1}-{end_row}")
# Add text to display the correlation coefficient and correlation
plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)
plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)
# Display the plot
plt.legend(loc="upper right")
plt.show()
# Determine the correlation sign based on the correlation coefficient
correlation_sign = "positive" if correlation_coefficient > 0 else "negative"
# Add the correlation and timestamps to the list
correlations_with_timestamps.append({
"Correlation": correlation,
"Correlation Sign": correlation_sign,
"Start Timestamp": start_timestamp,
"End Timestamp": end_timestamp,
"Start Row": start_row,
"End Row": end_row
})
# Display the list of correlation values and timestamps
for item in correlations_with_timestamps:
print(f"Correlation: {item['Correlation']:.2f}, Sign: {item['Correlation Sign']}, Start Timestamp: {item['Start Timestamp']}, End Timestamp: {item['End Timestamp']}, Start Row: {item['Start Row']}, End Row: {item['End Row']}")
Correlation: -0.12, Sign: negative, Start Timestamp: 2012-09-17T06:45:00Z, End Timestamp: 2012-10-31T10:00:00Z, Start Row: 0, End Row: 750 Correlation: 0.71, Sign: positive, Start Timestamp: 2012-10-31T10:15:00Z, End Timestamp: 2012-12-11T09:00:00Z, Start Row: 750, End Row: 1500 Correlation: 0.56, Sign: positive, Start Timestamp: 2012-12-11T09:15:00Z, End Timestamp: 2013-01-22T08:45:00Z, Start Row: 1500, End Row: 2250 Correlation: 0.80, Sign: positive, Start Timestamp: 2013-01-22T09:00:00Z, End Timestamp: 2013-03-04T07:45:00Z, Start Row: 2250, End Row: 3000 Correlation: -0.15, Sign: negative, Start Timestamp: 2013-03-04T08:00:00Z, End Timestamp: 2013-04-11T13:15:00Z, Start Row: 3000, End Row: 3750 Correlation: -0.16, Sign: negative, Start Timestamp: 2013-04-11T13:30:00Z, End Timestamp: 2013-05-23T11:45:00Z, Start Row: 3750, End Row: 4500 Correlation: 0.87, Sign: positive, Start Timestamp: 2013-05-23T12:00:00Z, End Timestamp: 2013-07-03T08:45:00Z, Start Row: 4500, End Row: 5250 Correlation: -0.04, Sign: negative, Start Timestamp: 2013-07-03T09:00:00Z, End Timestamp: 2013-08-16T08:00:00Z, Start Row: 5250, End Row: 6000 Correlation: 0.94, Sign: positive, Start Timestamp: 2013-08-16T08:15:00Z, End Timestamp: 2013-09-26T12:15:00Z, Start Row: 6000, End Row: 6750 Correlation: 0.94, Sign: positive, Start Timestamp: 2013-09-26T12:30:00Z, End Timestamp: 2013-11-13T13:00:00Z, Start Row: 6750, End Row: 7500 Correlation: 0.37, Sign: positive, Start Timestamp: 2013-11-13T13:15:00Z, End Timestamp: 2013-12-23T10:15:00Z, Start Row: 7500, End Row: 8250 Correlation: 0.10, Sign: positive, Start Timestamp: 2013-12-23T12:00:00Z, End Timestamp: 2014-01-31T10:00:00Z, Start Row: 8250, End Row: 9000 Correlation: 0.14, Sign: positive, Start Timestamp: 2014-01-31T10:15:00Z, End Timestamp: 2014-03-12T08:45:00Z, Start Row: 9000, End Row: 9750 Correlation: 0.85, Sign: positive, Start Timestamp: 2014-03-12T09:00:00Z, End Timestamp: 2014-04-21T12:15:00Z, Start Row: 9750, End Row: 10500 Correlation: 0.44, Sign: positive, Start Timestamp: 2014-04-21T12:30:00Z, End Timestamp: 2014-06-04T12:45:00Z, Start Row: 10500, End Row: 11250 Correlation: 0.19, Sign: positive, Start Timestamp: 2014-06-04T13:00:00Z, End Timestamp: 2014-07-14T08:00:00Z, Start Row: 11250, End Row: 12000 Correlation: 0.36, Sign: positive, Start Timestamp: 2014-07-14T08:15:00Z, End Timestamp: 2014-08-25T12:15:00Z, Start Row: 12000, End Row: 12750 Correlation: -0.75, Sign: negative, Start Timestamp: 2014-08-25T12:30:00Z, End Timestamp: 2014-10-01T13:00:00Z, Start Row: 12750, End Row: 13500 Correlation: 0.21, Sign: positive, Start Timestamp: 2014-10-01T13:15:00Z, End Timestamp: 2014-11-14T08:15:00Z, Start Row: 13500, End Row: 14250 Correlation: 0.56, Sign: positive, Start Timestamp: 2014-11-14T08:30:00Z, End Timestamp: 2014-12-23T08:30:00Z, Start Row: 14250, End Row: 15000 Correlation: 0.79, Sign: positive, Start Timestamp: 2014-12-23T08:45:00Z, End Timestamp: 2015-01-30T10:00:00Z, Start Row: 15000, End Row: 15750 Correlation: -0.02, Sign: negative, Start Timestamp: 2015-01-30T10:15:00Z, End Timestamp: 2015-03-10T13:30:00Z, Start Row: 15750, End Row: 16500 Correlation: 0.60, Sign: positive, Start Timestamp: 2015-03-10T13:45:00Z, End Timestamp: 2015-04-16T12:15:00Z, Start Row: 16500, End Row: 17250 Correlation: 0.21, Sign: positive, Start Timestamp: 2015-04-16T12:30:00Z, End Timestamp: 2015-05-28T13:15:00Z, Start Row: 17250, End Row: 18000 Correlation: -0.64, Sign: negative, Start Timestamp: 2015-05-28T13:30:00Z, End Timestamp: 2015-07-07T06:30:00Z, Start Row: 18000, End Row: 18750 Correlation: 0.49, Sign: positive, Start Timestamp: 2015-07-07T06:45:00Z, End Timestamp: 2015-08-14T11:30:00Z, Start Row: 18750, End Row: 19500 Correlation: -0.27, Sign: negative, Start Timestamp: 2015-08-14T11:45:00Z, End Timestamp: 2015-09-22T13:00:00Z, Start Row: 19500, End Row: 20250 Correlation: -0.24, Sign: negative, Start Timestamp: 2015-09-22T13:15:00Z, End Timestamp: 2015-11-09T09:30:00Z, Start Row: 20250, End Row: 21000 Correlation: 0.91, Sign: positive, Start Timestamp: 2015-11-09T09:45:00Z, End Timestamp: 2015-12-15T09:30:00Z, Start Row: 21000, End Row: 21750 Correlation: 0.61, Sign: positive, Start Timestamp: 2015-12-15T09:45:00Z, End Timestamp: 2016-01-20T08:15:00Z, Start Row: 21750, End Row: 22500 Correlation: 0.50, Sign: positive, Start Timestamp: 2016-01-20T08:30:00Z, End Timestamp: 2016-02-23T14:45:00Z, Start Row: 22500, End Row: 23250 Correlation: 0.89, Sign: positive, Start Timestamp: 2016-02-23T15:00:00Z, End Timestamp: 2016-03-29T10:45:00Z, Start Row: 23250, End Row: 24000 Correlation: 0.89, Sign: positive, Start Timestamp: 2016-03-29T11:00:00Z, End Timestamp: 2016-05-03T07:00:00Z, Start Row: 24000, End Row: 24750 Correlation: 0.82, Sign: positive, Start Timestamp: 2016-05-03T07:15:00Z, End Timestamp: 2016-06-07T13:30:00Z, Start Row: 24750, End Row: 25500 Correlation: 0.42, Sign: positive, Start Timestamp: 2016-06-07T13:45:00Z, End Timestamp: 2016-07-18T08:00:00Z, Start Row: 25500, End Row: 26250 Correlation: -0.54, Sign: negative, Start Timestamp: 2016-07-18T08:15:00Z, End Timestamp: 2016-08-19T14:30:00Z, Start Row: 26250, End Row: 27000 Correlation: 0.54, Sign: positive, Start Timestamp: 2016-08-22T06:30:00Z, End Timestamp: 2016-09-30T13:45:00Z, Start Row: 27000, End Row: 27750 Correlation: -0.36, Sign: negative, Start Timestamp: 2016-09-30T14:00:00Z, End Timestamp: 2016-11-07T07:45:00Z, Start Row: 27750, End Row: 28500 Correlation: 0.19, Sign: positive, Start Timestamp: 2016-11-07T08:00:00Z, End Timestamp: 2016-12-09T08:45:00Z, Start Row: 28500, End Row: 29250 Correlation: 0.15, Sign: positive, Start Timestamp: 2016-12-09T09:00:00Z, End Timestamp: 2017-01-12T08:15:00Z, Start Row: 29250, End Row: 30000 Correlation: -0.13, Sign: negative, Start Timestamp: 2017-01-12T08:30:00Z, End Timestamp: 2017-02-15T07:30:00Z, Start Row: 30000, End Row: 30750 Correlation: 0.28, Sign: positive, Start Timestamp: 2017-02-15T07:45:00Z, End Timestamp: 2017-03-21T07:30:00Z, Start Row: 30750, End Row: 31500 Correlation: 0.38, Sign: positive, Start Timestamp: 2017-03-21T07:45:00Z, End Timestamp: 2017-04-21T14:15:00Z, Start Row: 31500, End Row: 32250 Correlation: 0.04, Sign: positive, Start Timestamp: 2017-04-21T14:30:00Z, End Timestamp: 2017-05-29T12:30:00Z, Start Row: 32250, End Row: 33000 Correlation: 0.28, Sign: positive, Start Timestamp: 2017-05-29T12:45:00Z, End Timestamp: 2017-07-04T11:15:00Z, Start Row: 33000, End Row: 33750 Correlation: 0.88, Sign: positive, Start Timestamp: 2017-07-04T11:30:00Z, End Timestamp: 2017-08-07T08:30:00Z, Start Row: 33750, End Row: 34500 Correlation: -0.21, Sign: negative, Start Timestamp: 2017-08-07T08:45:00Z, End Timestamp: 2017-09-13T12:45:00Z, Start Row: 34500, End Row: 35250 Correlation: -0.13, Sign: negative, Start Timestamp: 2017-09-13T13:00:00Z, End Timestamp: 2017-10-17T12:00:00Z, Start Row: 35250, End Row: 36000 Correlation: -0.16, Sign: negative, Start Timestamp: 2017-10-17T12:15:00Z, End Timestamp: 2017-11-20T09:45:00Z, Start Row: 36000, End Row: 36750 Correlation: 0.64, Sign: positive, Start Timestamp: 2017-11-20T10:45:00Z, End Timestamp: 2017-12-22T08:00:00Z, Start Row: 36750, End Row: 37500 Correlation: 0.07, Sign: positive, Start Timestamp: 2017-12-22T08:15:00Z, End Timestamp: 2018-01-25T13:30:00Z, Start Row: 37500, End Row: 38250 Correlation: -0.39, Sign: negative, Start Timestamp: 2018-01-25T13:45:00Z, End Timestamp: 2018-02-28T12:30:00Z, Start Row: 38250, End Row: 39000 Correlation: 0.75, Sign: positive, Start Timestamp: 2018-02-28T12:45:00Z, End Timestamp: 2018-04-03T12:00:00Z, Start Row: 39000, End Row: 39750 Correlation: 0.10, Sign: positive, Start Timestamp: 2018-04-03T12:15:00Z, End Timestamp: 2018-05-09T09:15:00Z, Start Row: 39750, End Row: 40500 Correlation: -0.41, Sign: negative, Start Timestamp: 2018-05-09T09:30:00Z, End Timestamp: 2018-06-12T07:45:00Z, Start Row: 40500, End Row: 41250 Correlation: 0.16, Sign: positive, Start Timestamp: 2018-06-12T08:00:00Z, End Timestamp: 2018-07-17T10:45:00Z, Start Row: 41250, End Row: 42000 Correlation: 0.32, Sign: positive, Start Timestamp: 2018-07-17T11:00:00Z, End Timestamp: 2018-08-17T14:15:00Z, Start Row: 42000, End Row: 42750 Correlation: -0.30, Sign: negative, Start Timestamp: 2018-08-17T14:30:00Z, End Timestamp: 2018-09-28T09:30:00Z, Start Row: 42750, End Row: 43500 Correlation: 0.32, Sign: positive, Start Timestamp: 2018-09-28T09:45:00Z, End Timestamp: 2018-11-01T15:00:00Z, Start Row: 43500, End Row: 44250 Correlation: 0.49, Sign: positive, Start Timestamp: 2018-11-02T06:45:00Z, End Timestamp: 2018-12-05T11:30:00Z, Start Row: 44250, End Row: 45000 Correlation: 0.55, Sign: positive, Start Timestamp: 2018-12-05T11:45:00Z, End Timestamp: 2019-01-09T07:00:00Z, Start Row: 45000, End Row: 45750 Correlation: 0.58, Sign: positive, Start Timestamp: 2019-01-09T07:15:00Z, End Timestamp: 2019-02-11T11:15:00Z, Start Row: 45750, End Row: 46500 Correlation: 0.28, Sign: positive, Start Timestamp: 2019-02-11T11:30:00Z, End Timestamp: 2019-03-14T14:45:00Z, Start Row: 46500, End Row: 47250 Correlation: -0.11, Sign: negative, Start Timestamp: 2019-03-14T15:00:00Z, End Timestamp: 2019-04-17T10:00:00Z, Start Row: 47250, End Row: 48000 Correlation: 0.92, Sign: positive, Start Timestamp: 2019-04-17T10:45:00Z, End Timestamp: 2019-05-22T14:15:00Z, Start Row: 48000, End Row: 48750 Correlation: -0.69, Sign: negative, Start Timestamp: 2019-05-22T14:30:00Z, End Timestamp: 2019-06-28T15:00:00Z, Start Row: 48750, End Row: 49500
#I wanted to see those periods with negative correlation to see which one is increasing and decreasing.
# Define the row periods for negative correlations
row_periods = [(item["Start Row"], item["End Row"]) for item in correlations_with_timestamps if item["Correlation Sign"] == "negative"]
# Iterate through the row periods and create separate plots
for start_row, end_row in row_periods:
period_data = allticks_data[["ISFIN", "SASA"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing
# Extract the data for each column
x = period_data["ISFIN"]
y = period_data["SASA"]
# Create a new figure for each row period
plt.figure(figsize=(8, 6))
# Plot the data
plt.plot(x, label="ISFIN", color="blue")
plt.plot(y, label="SASA", color="red")
# Set labels and title
plt.xlabel("Row")
plt.ylabel("Value")
plt.title(f"Rows {start_row}-{end_row}")
# Add a legend
plt.legend()
# Show the individual plot
plt.show()
I chose ISFIN and SASA since they have the highest coefficients of variance, therefore, these two companies have the highest variations within this data (without having the effect of the magnitude of their stock values). I analyzed the periods where their correlations have negative sign. I aimed to see their unique relation in such periods rather than those periods where they can be affected from the overall market and have the same resulting change.
In the beginning, i.e. rows 1-750; ISFIN stock prices were higher than SASA but it started to decline through the end of this monthly period. SASA on the other hand, starts to increase. It can also be seen from the Google Trends data. It can be seen that SASA is trending more and more towards the end of the period.
Correlation data, IMG1: [https://ibb.co/JxY8y2k] Google Trends data, IMG2: [https://ibb.co/LJHJSYR]
In rows 26250-27000, (Start Timestamp: 2016-07-18T08:15:00Z, End Timestamp: 2016-08-19T14:30:00Z), ISFIN’s stock prices started lower than SASA’s but surpassed it at the middle of the month. It can be seen that after the beginning of the month, ISFIN is increasing and SASA is decreasing. Also in Google Trends data, even though SASA(red) has more number of searches in total, ISFIN had 7 peaks but SASA had 6. This shows that ISFIN has gained popularity in this month. Correlation data, IMG3: [https://ibb.co/3Y3Rn2Q] Google Trends data, IMG4: [https://ibb.co/0FSBNRH]
#I selected these two because they had the highest correlation
# Select the specified columns
selected_columns = ["TRKCM", "EREGL"]
# Initialize a list to store correlation values and corresponding timestamps
correlations_with_timestamps = []
# Calculate the correlation and create and display separate plots for each set of 750 rows
num_sets = len(allticks_data) // 750
for i in range(num_sets):
start_row = i * 750
end_row = (i + 1) * 750
period_data = allticks_data[selected_columns][start_row:end_row]
# Drop rows with missing values
period_data.dropna(inplace=True)
# Extract the data for each column
x = period_data["TRKCM"]
y = period_data["EREGL"]
# Calculate the correlation coefficient and correlation
correlation_coefficient = np.corrcoef(x, y)[0, 1]
correlation = np.corrcoef(x, y)[0, 1] # Calculate the correlation itself
# Get the start and end timestamps
start_timestamp = allticks_data["timestamp"].iloc[start_row]
end_timestamp = allticks_data["timestamp"].iloc[end_row - 1]
# Create a scatter plot for the period
plt.figure(figsize=(8, 6))
plt.scatter(x, y, label="Data Points", color="blue")
# Set labels and title
plt.xlabel("TRKCM")
plt.ylabel("EREGL")
plt.title(f"Scatter Plot for Rows {start_row+1}-{end_row}")
# Add text to display the correlation coefficient and correlation
plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)
plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)
# Display the plot
plt.legend(loc="upper right")
plt.show()
# Determine the correlation sign based on the correlation coefficient
correlation_sign = "positive" if correlation_coefficient > 0 else "negative"
# Add the correlation and timestamps to the list
correlations_with_timestamps.append({
"Correlation": correlation,
"Correlation Sign": correlation_sign,
"Start Timestamp": start_timestamp,
"End Timestamp": end_timestamp,
"Start Row": start_row,
"End Row": end_row
})
# Display the list of correlation values and timestamps
for item in correlations_with_timestamps:
print(f"Correlation: {item['Correlation']:.2f}, Sign: {item['Correlation Sign']}, Start Timestamp: {item['Start Timestamp']}, End Timestamp: {item['End Timestamp']}, Start Row: {item['Start Row']}, End Row: {item['End Row']}")
Correlation: 0.59, Sign: positive, Start Timestamp: 2012-09-17T06:45:00Z, End Timestamp: 2012-10-31T10:00:00Z, Start Row: 0, End Row: 750 Correlation: 0.28, Sign: positive, Start Timestamp: 2012-10-31T10:15:00Z, End Timestamp: 2012-12-11T09:00:00Z, Start Row: 750, End Row: 1500 Correlation: 0.92, Sign: positive, Start Timestamp: 2012-12-11T09:15:00Z, End Timestamp: 2013-01-22T08:45:00Z, Start Row: 1500, End Row: 2250 Correlation: 0.83, Sign: positive, Start Timestamp: 2013-01-22T09:00:00Z, End Timestamp: 2013-03-04T07:45:00Z, Start Row: 2250, End Row: 3000 Correlation: 0.53, Sign: positive, Start Timestamp: 2013-03-04T08:00:00Z, End Timestamp: 2013-04-11T13:15:00Z, Start Row: 3000, End Row: 3750 Correlation: 0.99, Sign: positive, Start Timestamp: 2013-04-11T13:30:00Z, End Timestamp: 2013-05-23T11:45:00Z, Start Row: 3750, End Row: 4500 Correlation: 0.75, Sign: positive, Start Timestamp: 2013-05-23T12:00:00Z, End Timestamp: 2013-07-03T08:45:00Z, Start Row: 4500, End Row: 5250 Correlation: 0.01, Sign: positive, Start Timestamp: 2013-07-03T09:00:00Z, End Timestamp: 2013-08-16T08:00:00Z, Start Row: 5250, End Row: 6000 Correlation: 0.56, Sign: positive, Start Timestamp: 2013-08-16T08:15:00Z, End Timestamp: 2013-09-26T12:15:00Z, Start Row: 6000, End Row: 6750 Correlation: 0.46, Sign: positive, Start Timestamp: 2013-09-26T12:30:00Z, End Timestamp: 2013-11-13T13:00:00Z, Start Row: 6750, End Row: 7500 Correlation: 0.84, Sign: positive, Start Timestamp: 2013-11-13T13:15:00Z, End Timestamp: 2013-12-23T10:15:00Z, Start Row: 7500, End Row: 8250 Correlation: -0.11, Sign: negative, Start Timestamp: 2013-12-23T12:00:00Z, End Timestamp: 2014-01-31T10:00:00Z, Start Row: 8250, End Row: 9000 Correlation: 0.70, Sign: positive, Start Timestamp: 2014-01-31T10:15:00Z, End Timestamp: 2014-03-12T08:45:00Z, Start Row: 9000, End Row: 9750 Correlation: 0.86, Sign: positive, Start Timestamp: 2014-03-12T09:00:00Z, End Timestamp: 2014-04-21T12:15:00Z, Start Row: 9750, End Row: 10500 Correlation: 0.64, Sign: positive, Start Timestamp: 2014-04-21T12:30:00Z, End Timestamp: 2014-06-04T12:45:00Z, Start Row: 10500, End Row: 11250 Correlation: 0.79, Sign: positive, Start Timestamp: 2014-06-04T13:00:00Z, End Timestamp: 2014-07-14T08:00:00Z, Start Row: 11250, End Row: 12000 Correlation: 0.56, Sign: positive, Start Timestamp: 2014-07-14T08:15:00Z, End Timestamp: 2014-08-25T12:15:00Z, Start Row: 12000, End Row: 12750 Correlation: 0.33, Sign: positive, Start Timestamp: 2014-08-25T12:30:00Z, End Timestamp: 2014-10-01T13:00:00Z, Start Row: 12750, End Row: 13500 Correlation: 0.72, Sign: positive, Start Timestamp: 2014-10-01T13:15:00Z, End Timestamp: 2014-11-14T08:15:00Z, Start Row: 13500, End Row: 14250 Correlation: 0.15, Sign: positive, Start Timestamp: 2014-11-14T08:30:00Z, End Timestamp: 2014-12-23T08:30:00Z, Start Row: 14250, End Row: 15000 Correlation: 0.22, Sign: positive, Start Timestamp: 2014-12-23T08:45:00Z, End Timestamp: 2015-01-30T10:00:00Z, Start Row: 15000, End Row: 15750 Correlation: 0.74, Sign: positive, Start Timestamp: 2015-01-30T10:15:00Z, End Timestamp: 2015-03-10T13:30:00Z, Start Row: 15750, End Row: 16500 Correlation: 0.87, Sign: positive, Start Timestamp: 2015-03-10T13:45:00Z, End Timestamp: 2015-04-16T12:15:00Z, Start Row: 16500, End Row: 17250 Correlation: 0.05, Sign: positive, Start Timestamp: 2015-04-16T12:30:00Z, End Timestamp: 2015-05-28T13:15:00Z, Start Row: 17250, End Row: 18000 Correlation: 0.28, Sign: positive, Start Timestamp: 2015-05-28T13:30:00Z, End Timestamp: 2015-07-07T06:30:00Z, Start Row: 18000, End Row: 18750 Correlation: 0.79, Sign: positive, Start Timestamp: 2015-07-07T06:45:00Z, End Timestamp: 2015-08-14T11:30:00Z, Start Row: 18750, End Row: 19500 Correlation: 0.71, Sign: positive, Start Timestamp: 2015-08-14T11:45:00Z, End Timestamp: 2015-09-22T13:00:00Z, Start Row: 19500, End Row: 20250 Correlation: 0.82, Sign: positive, Start Timestamp: 2015-09-22T13:15:00Z, End Timestamp: 2015-11-09T09:30:00Z, Start Row: 20250, End Row: 21000 Correlation: 0.93, Sign: positive, Start Timestamp: 2015-11-09T09:45:00Z, End Timestamp: 2015-12-15T09:30:00Z, Start Row: 21000, End Row: 21750 Correlation: 0.18, Sign: positive, Start Timestamp: 2015-12-15T09:45:00Z, End Timestamp: 2016-01-20T08:15:00Z, Start Row: 21750, End Row: 22500 Correlation: 0.84, Sign: positive, Start Timestamp: 2016-01-20T08:30:00Z, End Timestamp: 2016-02-23T14:45:00Z, Start Row: 22500, End Row: 23250 Correlation: 0.84, Sign: positive, Start Timestamp: 2016-02-23T15:00:00Z, End Timestamp: 2016-03-29T10:45:00Z, Start Row: 23250, End Row: 24000 Correlation: 0.72, Sign: positive, Start Timestamp: 2016-03-29T11:00:00Z, End Timestamp: 2016-05-03T07:00:00Z, Start Row: 24000, End Row: 24750 Correlation: -0.37, Sign: negative, Start Timestamp: 2016-05-03T07:15:00Z, End Timestamp: 2016-06-07T13:30:00Z, Start Row: 24750, End Row: 25500 Correlation: 0.90, Sign: positive, Start Timestamp: 2016-06-07T13:45:00Z, End Timestamp: 2016-07-18T08:00:00Z, Start Row: 25500, End Row: 26250 Correlation: 0.77, Sign: positive, Start Timestamp: 2016-07-18T08:15:00Z, End Timestamp: 2016-08-19T14:30:00Z, Start Row: 26250, End Row: 27000 Correlation: 0.56, Sign: positive, Start Timestamp: 2016-08-22T06:30:00Z, End Timestamp: 2016-09-30T13:45:00Z, Start Row: 27000, End Row: 27750 Correlation: 0.25, Sign: positive, Start Timestamp: 2016-09-30T14:00:00Z, End Timestamp: 2016-11-07T07:45:00Z, Start Row: 27750, End Row: 28500 Correlation: 0.92, Sign: positive, Start Timestamp: 2016-11-07T08:00:00Z, End Timestamp: 2016-12-09T08:45:00Z, Start Row: 28500, End Row: 29250 Correlation: 0.30, Sign: positive, Start Timestamp: 2016-12-09T09:00:00Z, End Timestamp: 2017-01-12T08:15:00Z, Start Row: 29250, End Row: 30000 Correlation: 0.83, Sign: positive, Start Timestamp: 2017-01-12T08:30:00Z, End Timestamp: 2017-02-15T07:30:00Z, Start Row: 30000, End Row: 30750 Correlation: 0.08, Sign: positive, Start Timestamp: 2017-02-15T07:45:00Z, End Timestamp: 2017-03-21T07:30:00Z, Start Row: 30750, End Row: 31500 Correlation: 0.53, Sign: positive, Start Timestamp: 2017-03-21T07:45:00Z, End Timestamp: 2017-04-21T14:15:00Z, Start Row: 31500, End Row: 32250 Correlation: 0.52, Sign: positive, Start Timestamp: 2017-04-21T14:30:00Z, End Timestamp: 2017-05-29T12:30:00Z, Start Row: 32250, End Row: 33000 Correlation: 0.40, Sign: positive, Start Timestamp: 2017-05-29T12:45:00Z, End Timestamp: 2017-07-04T11:15:00Z, Start Row: 33000, End Row: 33750 Correlation: 0.89, Sign: positive, Start Timestamp: 2017-07-04T11:30:00Z, End Timestamp: 2017-08-07T08:30:00Z, Start Row: 33750, End Row: 34500 Correlation: 0.34, Sign: positive, Start Timestamp: 2017-08-07T08:45:00Z, End Timestamp: 2017-09-13T12:45:00Z, Start Row: 34500, End Row: 35250 Correlation: 0.88, Sign: positive, Start Timestamp: 2017-09-13T13:00:00Z, End Timestamp: 2017-10-17T12:00:00Z, Start Row: 35250, End Row: 36000 Correlation: 0.94, Sign: positive, Start Timestamp: 2017-10-17T12:15:00Z, End Timestamp: 2017-11-20T09:45:00Z, Start Row: 36000, End Row: 36750 Correlation: 0.65, Sign: positive, Start Timestamp: 2017-11-20T10:45:00Z, End Timestamp: 2017-12-22T08:00:00Z, Start Row: 36750, End Row: 37500 Correlation: 0.41, Sign: positive, Start Timestamp: 2017-12-22T08:15:00Z, End Timestamp: 2018-01-25T13:30:00Z, Start Row: 37500, End Row: 38250 Correlation: -0.15, Sign: negative, Start Timestamp: 2018-01-25T13:45:00Z, End Timestamp: 2018-02-28T12:30:00Z, Start Row: 38250, End Row: 39000 Correlation: -0.35, Sign: negative, Start Timestamp: 2018-02-28T12:45:00Z, End Timestamp: 2018-04-03T12:00:00Z, Start Row: 39000, End Row: 39750 Correlation: 0.29, Sign: positive, Start Timestamp: 2018-04-03T12:15:00Z, End Timestamp: 2018-05-09T09:15:00Z, Start Row: 39750, End Row: 40500 Correlation: -0.66, Sign: negative, Start Timestamp: 2018-05-09T09:30:00Z, End Timestamp: 2018-06-12T07:45:00Z, Start Row: 40500, End Row: 41250 Correlation: -0.04, Sign: negative, Start Timestamp: 2018-06-12T08:00:00Z, End Timestamp: 2018-07-17T10:45:00Z, Start Row: 41250, End Row: 42000 Correlation: -0.23, Sign: negative, Start Timestamp: 2018-07-17T11:00:00Z, End Timestamp: 2018-08-17T14:15:00Z, Start Row: 42000, End Row: 42750 Correlation: -0.30, Sign: negative, Start Timestamp: 2018-08-17T14:30:00Z, End Timestamp: 2018-09-28T09:30:00Z, Start Row: 42750, End Row: 43500 Correlation: 0.93, Sign: positive, Start Timestamp: 2018-09-28T09:45:00Z, End Timestamp: 2018-11-01T15:00:00Z, Start Row: 43500, End Row: 44250 Correlation: 0.76, Sign: positive, Start Timestamp: 2018-11-02T06:45:00Z, End Timestamp: 2018-12-05T11:30:00Z, Start Row: 44250, End Row: 45000 Correlation: 0.57, Sign: positive, Start Timestamp: 2018-12-05T11:45:00Z, End Timestamp: 2019-01-09T07:00:00Z, Start Row: 45000, End Row: 45750 Correlation: 0.85, Sign: positive, Start Timestamp: 2019-01-09T07:15:00Z, End Timestamp: 2019-02-11T11:15:00Z, Start Row: 45750, End Row: 46500 Correlation: 0.83, Sign: positive, Start Timestamp: 2019-02-11T11:30:00Z, End Timestamp: 2019-03-14T14:45:00Z, Start Row: 46500, End Row: 47250 Correlation: 0.17, Sign: positive, Start Timestamp: 2019-03-14T15:00:00Z, End Timestamp: 2019-04-17T10:00:00Z, Start Row: 47250, End Row: 48000 Correlation: 0.73, Sign: positive, Start Timestamp: 2019-04-17T10:45:00Z, End Timestamp: 2019-05-22T14:15:00Z, Start Row: 48000, End Row: 48750 Correlation: 0.90, Sign: positive, Start Timestamp: 2019-05-22T14:30:00Z, End Timestamp: 2019-06-28T15:00:00Z, Start Row: 48750, End Row: 49500
#I wanted to see those periods with negative correlation to see which one is increasing and decreasing.
# Define the row periods for negative correlations
row_periods = [(item["Start Row"], item["End Row"]) for item in correlations_with_timestamps if item["Correlation Sign"] == "negative"]
]
# Iterate through the row periods and create separate plots
for start_row, end_row in row_periods:
period_data = allticks_data[["TRKCM", "EREGL"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing
# Extract the data for each column
x = period_data["TRKCM"]
y = period_data["EREGL"]
# Create a new figure for each row period
plt.figure(figsize=(8, 6))
# Plot the data
plt.plot(x, label="TRKCM", color="blue")
plt.plot(y, label="EREGL", color="red")
# Set labels and title
plt.xlabel("Row")
plt.ylabel("Value")
plt.title(f"Rows {start_row}-{end_row}")
# Add a legend
plt.legend()
# Show the individual plot
plt.show()
I also wanted to examine TRKCM and EREGL since these two have the highest correlation. When I look at the periods when they have negative correlations, I saw that EREGLs stock prices are always higher than TRKCM and its fluctuations are more apparent from the plot. For example, in rows 8250-8500 they behaved in the same way but after the 8500th row (2014-01-06T14:15:00Z) EREGL starts to increase but TRKCM is declining. Correlation data, IMG5:[https://ibb.co/JsTQGMR]
#I selected these two because they had the second highest correlation. TRKCM and EREGL have the highest correlation but TRKCM doesnt't have Google Trends data for comparison.
# Select the specified columns
selected_columns = ["FROTO", "EREGL"]
# Initialize a list to store correlation values and corresponding timestamps for negative correlations
correlations_with_timestamps = []
# Calculate the correlation and create and display separate plots for each set of 750 rows
num_sets = len(allticks_data) // 750
for i in range(num_sets):
start_row = i * 750
end_row = (i + 1) * 750
period_data = allticks_data[selected_columns][start_row:end_row]
# Drop rows with missing values
period_data.dropna(inplace=True)
# Extract the data for each column
x = period_data["FROTO"] # Replacing "TRKCM" with "FROTO"
y = period_data["EREGL"]
# Calculate the correlation coefficient and correlation
correlation_coefficient = np.corrcoef(x, y)[0, 1]
correlation = np.corrcoef(x, y)[0, 1] # Calculate the correlation itself
# Get the start and end timestamps
start_timestamp = allticks_data["timestamp"].iloc[start_row]
end_timestamp = allticks_data["timestamp"].iloc[end_row - 1]
# Determine the correlation sign based on the correlation coefficient
correlation_sign = "positive" if correlation_coefficient > 0 else "negative"
# Create a scatter plot for the period only if the correlation is negative
if correlation_sign == "negative":
# Create a scatter plot for the period
plt.figure(figsize=(8, 6))
plt.scatter(x, y, label="Data Points", color="blue")
# Set labels and title
plt.xlabel("FROTO") # Adjusted for "FROTO"
plt.ylabel("EREGL")
plt.title(f"Scatter Plot for Rows {start_row+1}-{end_row}")
# Add text to display the correlation coefficient and correlation
plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)
plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)
# Display the plot
plt.legend(loc="upper right")
plt.show()
# Add the correlation and timestamps to the list
correlations_with_timestamps.append({
"Correlation": correlation,
"Correlation Sign": correlation_sign,
"Start Timestamp": start_timestamp,
"End Timestamp": end_timestamp,
"Start Row": start_row,
"End Row": end_row
})
for item in correlations_with_timestamps:
print(f"Correlation: {item['Correlation']:.2f}, Sign: {item['Correlation Sign']}, Start Timestamp: {item['Start Timestamp']}, End Timestamp: {item['End Timestamp']}, Start Row: {item['Start Row']}, End Row: {item['End Row']}")
Correlation: -0.09, Sign: negative, Start Timestamp: 2012-09-17T06:45:00Z, End Timestamp: 2012-10-31T10:00:00Z, Start Row: 0, End Row: 750 Correlation: 0.76, Sign: positive, Start Timestamp: 2012-10-31T10:15:00Z, End Timestamp: 2012-12-11T09:00:00Z, Start Row: 750, End Row: 1500 Correlation: 0.56, Sign: positive, Start Timestamp: 2012-12-11T09:15:00Z, End Timestamp: 2013-01-22T08:45:00Z, Start Row: 1500, End Row: 2250 Correlation: 0.80, Sign: positive, Start Timestamp: 2013-01-22T09:00:00Z, End Timestamp: 2013-03-04T07:45:00Z, Start Row: 2250, End Row: 3000 Correlation: 0.60, Sign: positive, Start Timestamp: 2013-03-04T08:00:00Z, End Timestamp: 2013-04-11T13:15:00Z, Start Row: 3000, End Row: 3750 Correlation: 0.94, Sign: positive, Start Timestamp: 2013-04-11T13:30:00Z, End Timestamp: 2013-05-23T11:45:00Z, Start Row: 3750, End Row: 4500 Correlation: 0.50, Sign: positive, Start Timestamp: 2013-05-23T12:00:00Z, End Timestamp: 2013-07-03T08:45:00Z, Start Row: 4500, End Row: 5250 Correlation: 0.65, Sign: positive, Start Timestamp: 2013-07-03T09:00:00Z, End Timestamp: 2013-08-16T08:00:00Z, Start Row: 5250, End Row: 6000 Correlation: 0.66, Sign: positive, Start Timestamp: 2013-08-16T08:15:00Z, End Timestamp: 2013-09-26T12:15:00Z, Start Row: 6000, End Row: 6750 Correlation: -0.25, Sign: negative, Start Timestamp: 2013-09-26T12:30:00Z, End Timestamp: 2013-11-13T13:00:00Z, Start Row: 6750, End Row: 7500 Correlation: 0.68, Sign: positive, Start Timestamp: 2013-11-13T13:15:00Z, End Timestamp: 2013-12-23T10:15:00Z, Start Row: 7500, End Row: 8250 Correlation: -0.64, Sign: negative, Start Timestamp: 2013-12-23T12:00:00Z, End Timestamp: 2014-01-31T10:00:00Z, Start Row: 8250, End Row: 9000 Correlation: 0.61, Sign: positive, Start Timestamp: 2014-01-31T10:15:00Z, End Timestamp: 2014-03-12T08:45:00Z, Start Row: 9000, End Row: 9750 Correlation: 0.83, Sign: positive, Start Timestamp: 2014-03-12T09:00:00Z, End Timestamp: 2014-04-21T12:15:00Z, Start Row: 9750, End Row: 10500 Correlation: 0.82, Sign: positive, Start Timestamp: 2014-04-21T12:30:00Z, End Timestamp: 2014-06-04T12:45:00Z, Start Row: 10500, End Row: 11250 Correlation: -0.24, Sign: negative, Start Timestamp: 2014-06-04T13:00:00Z, End Timestamp: 2014-07-14T08:00:00Z, Start Row: 11250, End Row: 12000 Correlation: 0.81, Sign: positive, Start Timestamp: 2014-07-14T08:15:00Z, End Timestamp: 2014-08-25T12:15:00Z, Start Row: 12000, End Row: 12750 Correlation: 0.06, Sign: positive, Start Timestamp: 2014-08-25T12:30:00Z, End Timestamp: 2014-10-01T13:00:00Z, Start Row: 12750, End Row: 13500 Correlation: 0.56, Sign: positive, Start Timestamp: 2014-10-01T13:15:00Z, End Timestamp: 2014-11-14T08:15:00Z, Start Row: 13500, End Row: 14250 Correlation: 0.23, Sign: positive, Start Timestamp: 2014-11-14T08:30:00Z, End Timestamp: 2014-12-23T08:30:00Z, Start Row: 14250, End Row: 15000 Correlation: -0.26, Sign: negative, Start Timestamp: 2014-12-23T08:45:00Z, End Timestamp: 2015-01-30T10:00:00Z, Start Row: 15000, End Row: 15750 Correlation: 0.24, Sign: positive, Start Timestamp: 2015-01-30T10:15:00Z, End Timestamp: 2015-03-10T13:30:00Z, Start Row: 15750, End Row: 16500 Correlation: 0.56, Sign: positive, Start Timestamp: 2015-03-10T13:45:00Z, End Timestamp: 2015-04-16T12:15:00Z, Start Row: 16500, End Row: 17250 Correlation: 0.85, Sign: positive, Start Timestamp: 2015-04-16T12:30:00Z, End Timestamp: 2015-05-28T13:15:00Z, Start Row: 17250, End Row: 18000 Correlation: 0.08, Sign: positive, Start Timestamp: 2015-05-28T13:30:00Z, End Timestamp: 2015-07-07T06:30:00Z, Start Row: 18000, End Row: 18750 Correlation: 0.50, Sign: positive, Start Timestamp: 2015-07-07T06:45:00Z, End Timestamp: 2015-08-14T11:30:00Z, Start Row: 18750, End Row: 19500 Correlation: 0.57, Sign: positive, Start Timestamp: 2015-08-14T11:45:00Z, End Timestamp: 2015-09-22T13:00:00Z, Start Row: 19500, End Row: 20250 Correlation: 0.78, Sign: positive, Start Timestamp: 2015-09-22T13:15:00Z, End Timestamp: 2015-11-09T09:30:00Z, Start Row: 20250, End Row: 21000 Correlation: 0.84, Sign: positive, Start Timestamp: 2015-11-09T09:45:00Z, End Timestamp: 2015-12-15T09:30:00Z, Start Row: 21000, End Row: 21750 Correlation: 0.83, Sign: positive, Start Timestamp: 2015-12-15T09:45:00Z, End Timestamp: 2016-01-20T08:15:00Z, Start Row: 21750, End Row: 22500 Correlation: 0.84, Sign: positive, Start Timestamp: 2016-01-20T08:30:00Z, End Timestamp: 2016-02-23T14:45:00Z, Start Row: 22500, End Row: 23250 Correlation: 0.89, Sign: positive, Start Timestamp: 2016-02-23T15:00:00Z, End Timestamp: 2016-03-29T10:45:00Z, Start Row: 23250, End Row: 24000 Correlation: 0.85, Sign: positive, Start Timestamp: 2016-03-29T11:00:00Z, End Timestamp: 2016-05-03T07:00:00Z, Start Row: 24000, End Row: 24750 Correlation: 0.47, Sign: positive, Start Timestamp: 2016-05-03T07:15:00Z, End Timestamp: 2016-06-07T13:30:00Z, Start Row: 24750, End Row: 25500 Correlation: 0.06, Sign: positive, Start Timestamp: 2016-06-07T13:45:00Z, End Timestamp: 2016-07-18T08:00:00Z, Start Row: 25500, End Row: 26250 Correlation: 0.80, Sign: positive, Start Timestamp: 2016-07-18T08:15:00Z, End Timestamp: 2016-08-19T14:30:00Z, Start Row: 26250, End Row: 27000 Correlation: 0.87, Sign: positive, Start Timestamp: 2016-08-22T06:30:00Z, End Timestamp: 2016-09-30T13:45:00Z, Start Row: 27000, End Row: 27750 Correlation: 0.70, Sign: positive, Start Timestamp: 2016-09-30T14:00:00Z, End Timestamp: 2016-11-07T07:45:00Z, Start Row: 27750, End Row: 28500 Correlation: -0.10, Sign: negative, Start Timestamp: 2016-11-07T08:00:00Z, End Timestamp: 2016-12-09T08:45:00Z, Start Row: 28500, End Row: 29250 Correlation: 0.45, Sign: positive, Start Timestamp: 2016-12-09T09:00:00Z, End Timestamp: 2017-01-12T08:15:00Z, Start Row: 29250, End Row: 30000 Correlation: 0.78, Sign: positive, Start Timestamp: 2017-01-12T08:30:00Z, End Timestamp: 2017-02-15T07:30:00Z, Start Row: 30000, End Row: 30750 Correlation: 0.51, Sign: positive, Start Timestamp: 2017-02-15T07:45:00Z, End Timestamp: 2017-03-21T07:30:00Z, Start Row: 30750, End Row: 31500 Correlation: 0.47, Sign: positive, Start Timestamp: 2017-03-21T07:45:00Z, End Timestamp: 2017-04-21T14:15:00Z, Start Row: 31500, End Row: 32250 Correlation: 0.50, Sign: positive, Start Timestamp: 2017-04-21T14:30:00Z, End Timestamp: 2017-05-29T12:30:00Z, Start Row: 32250, End Row: 33000 Correlation: 0.21, Sign: positive, Start Timestamp: 2017-05-29T12:45:00Z, End Timestamp: 2017-07-04T11:15:00Z, Start Row: 33000, End Row: 33750 Correlation: 0.86, Sign: positive, Start Timestamp: 2017-07-04T11:30:00Z, End Timestamp: 2017-08-07T08:30:00Z, Start Row: 33750, End Row: 34500 Correlation: 0.35, Sign: positive, Start Timestamp: 2017-08-07T08:45:00Z, End Timestamp: 2017-09-13T12:45:00Z, Start Row: 34500, End Row: 35250 Correlation: 0.39, Sign: positive, Start Timestamp: 2017-09-13T13:00:00Z, End Timestamp: 2017-10-17T12:00:00Z, Start Row: 35250, End Row: 36000 Correlation: 0.90, Sign: positive, Start Timestamp: 2017-10-17T12:15:00Z, End Timestamp: 2017-11-20T09:45:00Z, Start Row: 36000, End Row: 36750 Correlation: 0.51, Sign: positive, Start Timestamp: 2017-11-20T10:45:00Z, End Timestamp: 2017-12-22T08:00:00Z, Start Row: 36750, End Row: 37500 Correlation: 0.55, Sign: positive, Start Timestamp: 2017-12-22T08:15:00Z, End Timestamp: 2018-01-25T13:30:00Z, Start Row: 37500, End Row: 38250 Correlation: 0.50, Sign: positive, Start Timestamp: 2018-01-25T13:45:00Z, End Timestamp: 2018-02-28T12:30:00Z, Start Row: 38250, End Row: 39000 Correlation: 0.19, Sign: positive, Start Timestamp: 2018-02-28T12:45:00Z, End Timestamp: 2018-04-03T12:00:00Z, Start Row: 39000, End Row: 39750 Correlation: 0.08, Sign: positive, Start Timestamp: 2018-04-03T12:15:00Z, End Timestamp: 2018-05-09T09:15:00Z, Start Row: 39750, End Row: 40500 Correlation: 0.83, Sign: positive, Start Timestamp: 2018-05-09T09:30:00Z, End Timestamp: 2018-06-12T07:45:00Z, Start Row: 40500, End Row: 41250 Correlation: 0.20, Sign: positive, Start Timestamp: 2018-06-12T08:00:00Z, End Timestamp: 2018-07-17T10:45:00Z, Start Row: 41250, End Row: 42000 Correlation: 0.12, Sign: positive, Start Timestamp: 2018-07-17T11:00:00Z, End Timestamp: 2018-08-17T14:15:00Z, Start Row: 42000, End Row: 42750 Correlation: 0.25, Sign: positive, Start Timestamp: 2018-08-17T14:30:00Z, End Timestamp: 2018-09-28T09:30:00Z, Start Row: 42750, End Row: 43500 Correlation: 0.82, Sign: positive, Start Timestamp: 2018-09-28T09:45:00Z, End Timestamp: 2018-11-01T15:00:00Z, Start Row: 43500, End Row: 44250 Correlation: 0.57, Sign: positive, Start Timestamp: 2018-11-02T06:45:00Z, End Timestamp: 2018-12-05T11:30:00Z, Start Row: 44250, End Row: 45000 Correlation: 0.57, Sign: positive, Start Timestamp: 2018-12-05T11:45:00Z, End Timestamp: 2019-01-09T07:00:00Z, Start Row: 45000, End Row: 45750 Correlation: 0.69, Sign: positive, Start Timestamp: 2019-01-09T07:15:00Z, End Timestamp: 2019-02-11T11:15:00Z, Start Row: 45750, End Row: 46500 Correlation: 0.14, Sign: positive, Start Timestamp: 2019-02-11T11:30:00Z, End Timestamp: 2019-03-14T14:45:00Z, Start Row: 46500, End Row: 47250 Correlation: 0.66, Sign: positive, Start Timestamp: 2019-03-14T15:00:00Z, End Timestamp: 2019-04-17T10:00:00Z, Start Row: 47250, End Row: 48000 Correlation: 0.44, Sign: positive, Start Timestamp: 2019-04-17T10:45:00Z, End Timestamp: 2019-05-22T14:15:00Z, Start Row: 48000, End Row: 48750 Correlation: 0.80, Sign: positive, Start Timestamp: 2019-05-22T14:30:00Z, End Timestamp: 2019-06-28T15:00:00Z, Start Row: 48750, End Row: 49500
# Define the row periods for negative correlations
row_periods = [(item["Start Row"], item["End Row"]) for item in correlations_with_timestamps if item["Correlation Sign"] == "negative"]
# Iterate through the row periods with negative correlations and create separate plots
for start_row, end_row in row_periods:
period_data = allticks_data[["FROTO", "EREGL"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing and "FROTO" column
# Extract the data for each column
x = period_data["FROTO"]
y = period_data["EREGL"]
# Create a new figure for each row period
plt.figure(figsize=(8, 6))
# Plot the data
plt.plot(x, label="FROTO", color="blue")
plt.plot(y, label="EREGL", color="red")
# Set labels and title
plt.xlabel("Row")
plt.ylabel("Value")
plt.title(f"Rows {start_row}-{end_row}")
# Add a legend
plt.legend()
# Show the individual plot
plt.show()
I couldn’t find the stock related data of TRKCM in Google Trends. Thus I decided to try for FROTO and EREGL, since they have the second highest correlation. FROTO has much higher stock prices than EREGL, thus EREGL’s fluctuations aren’t seen clearly from the plots, yet it seems that when they had negative correlations; FROTO is decreasing and EREGL is increasing.
For example, in this period, FROTO seems to be decreasing and EREGL is increasing but FROTO’s stock prices are still much higher. Correlation data, IMG6: [https://ibb.co/tLmbQHy] Google Trends data, IMG7: [https://ibb.co/S56n5Fs]
In Google Trends, The average of the FROTO is higher meaning that it is searched more, butEREGL had 11 peaks while FROTO had 10 peaks. It indicates that EREGL has gained popularity in that period but FROTO is still dominating
####### PCA ANALYSIS
from sklearn.decomposition import PCA
# Drop the timestamp column
stock_data = allticks_data.drop(columns=['timestamp'])
# Create an imputer to fill missing values with the column mean
imputer = SimpleImputer(strategy='mean')
stock_data_imputed = imputer.fit_transform(stock_data)
# Standardize the data
scaler = StandardScaler()
stock_data_standardized = scaler.fit_transform(stock_data_imputed)
# Initialize and fit the PCA model
n_components = 60 # Number of principal components to retain ##i first selected the number of components as the original dimension
pca = PCA(n_components=n_components)
pca.fit(stock_data_standardized)
# Get the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
# Create a table for component statistics
component_stats = pd.DataFrame({
'Component': range(1, len(explained_variance_ratio) + 1),
'Standard Deviation': np.sqrt(pca.explained_variance_),
'Proportion of Variance': pca.explained_variance_ratio_,
'Cumulative Proportion': np.cumsum(pca.explained_variance_ratio_)
})
# Display the component statistics table
print("Component Statistics:")
print(component_stats)
# Create a table for loadings
loadings = pd.DataFrame(pca.components_, columns=stock_data.columns)
# Display the loadings table
print("\nLoadings Table:")
print(loadings)
component_stats.to_excel('component_stats.xlsx',index=False)
loadings.to_excel('loadings.xlsx',index=False)
Component Statistics:
Component Standard Deviation Proportion of Variance \
0 1 5.411674 0.488094
1 2 3.196438 0.170283
2 3 2.536973 0.107268
3 4 1.730182 0.049891
4 5 1.511590 0.038081
5 6 1.133295 0.021406
6 7 0.958872 0.015324
7 8 0.892638 0.013280
8 9 0.783668 0.010235
9 10 0.729850 0.008878
10 11 0.707544 0.008343
11 12 0.653015 0.007107
12 13 0.611131 0.006225
13 14 0.597257 0.005945
14 15 0.536428 0.004796
15 16 0.522433 0.004549
16 17 0.456779 0.003477
17 18 0.430460 0.003088
18 19 0.397430 0.002632
19 20 0.395831 0.002611
20 21 0.365204 0.002223
21 22 0.351648 0.002061
22 23 0.330286 0.001818
23 24 0.320561 0.001713
24 25 0.312629 0.001629
25 26 0.285812 0.001361
26 27 0.278007 0.001288
27 28 0.266828 0.001187
28 29 0.259451 0.001122
29 30 0.253319 0.001069
30 31 0.240825 0.000967
31 32 0.236844 0.000935
32 33 0.221634 0.000819
33 34 0.218693 0.000797
34 35 0.210692 0.000740
35 36 0.203632 0.000691
36 37 0.198371 0.000656
37 38 0.192681 0.000619
38 39 0.190980 0.000608
39 40 0.182159 0.000553
40 41 0.168839 0.000475
41 42 0.166813 0.000464
42 43 0.160176 0.000428
43 44 0.159183 0.000422
44 45 0.154896 0.000400
45 46 0.148436 0.000367
46 47 0.145597 0.000353
47 48 0.141189 0.000332
48 49 0.137005 0.000313
49 50 0.130059 0.000282
50 51 0.123291 0.000253
51 52 0.116004 0.000224
52 53 0.114420 0.000218
53 54 0.112003 0.000209
54 55 0.104784 0.000183
55 56 0.099252 0.000164
56 57 0.095562 0.000152
57 58 0.093453 0.000146
58 59 0.092427 0.000142
59 60 0.078555 0.000103
Cumulative Proportion
0 0.488094
1 0.658377
2 0.765646
3 0.815537
4 0.853618
5 0.875023
6 0.890347
7 0.903627
8 0.913862
9 0.922740
10 0.931083
11 0.938190
12 0.944415
13 0.950360
14 0.955156
15 0.959705
16 0.963182
17 0.966270
18 0.968903
19 0.971514
20 0.973737
21 0.975798
22 0.977616
23 0.979329
24 0.980958
25 0.982319
26 0.983607
27 0.984794
28 0.985916
29 0.986985
30 0.987952
31 0.988887
32 0.989705
33 0.990502
34 0.991242
35 0.991933
36 0.992589
37 0.993208
38 0.993816
39 0.994369
40 0.994844
41 0.995308
42 0.995735
43 0.996158
44 0.996557
45 0.996925
46 0.997278
47 0.997610
48 0.997923
49 0.998205
50 0.998458
51 0.998683
52 0.998901
53 0.999110
54 0.999293
55 0.999457
56 0.999609
57 0.999755
58 0.999897
59 1.000000
Loadings Table:
AEFES AKBNK AKSA AKSEN ALARK ALBRK ANACM \
0 -0.010619 -0.115330 -0.162991 -0.074086 -0.139545 -0.008576 -0.168831
1 -0.164429 -0.212004 -0.033005 -0.159788 -0.081720 -0.112583 0.058363
2 -0.261382 0.073484 0.068234 -0.147875 -0.106180 -0.203374 -0.094810
3 0.014416 -0.094138 0.076323 -0.196707 -0.158188 0.248636 -0.086114
4 0.015394 -0.105984 0.066446 0.236019 -0.223156 -0.122610 -0.011328
5 -0.018140 0.085660 -0.180828 0.241494 -0.126733 0.140711 0.105549
6 0.075174 0.108106 -0.199153 0.119555 -0.150210 0.406538 0.056822
7 0.059581 -0.046841 -0.060453 -0.042819 -0.004839 0.053875 -0.000587
8 -0.114302 -0.051819 -0.063405 -0.069009 0.026365 0.118118 -0.062160
9 -0.192159 0.000015 -0.083635 -0.118536 0.117608 0.089603 -0.042360
10 0.455544 0.108440 -0.128275 -0.144293 -0.036502 -0.476641 -0.049088
11 0.273656 0.002982 0.064819 -0.092101 -0.016472 0.160483 -0.059559
12 0.102158 -0.147034 0.080613 0.008406 -0.040919 -0.093194 0.010223
13 0.363439 -0.060055 0.171371 0.018025 -0.014562 0.263588 0.050178
14 0.017763 0.077017 0.062910 -0.083708 -0.146870 -0.047852 -0.063514
15 0.290299 -0.137708 0.037418 0.205699 0.118733 -0.117275 0.072106
16 0.157388 -0.010414 0.045018 -0.348017 -0.066540 -0.061797 -0.068713
17 0.184640 -0.024658 -0.146063 -0.056727 -0.054352 -0.154460 0.028410
18 -0.192447 0.055082 0.059132 -0.107376 0.020367 -0.011428 0.022881
19 -0.042429 0.117334 0.095559 -0.024542 -0.293436 -0.099039 0.007348
20 0.106945 -0.028529 0.155411 0.254699 -0.015594 -0.222860 0.068526
21 0.108365 0.017511 0.057691 -0.124020 0.091019 0.143323 0.078801
22 -0.030401 -0.007885 -0.158598 0.174079 -0.054509 -0.276084 0.047905
23 -0.162112 0.048563 0.152288 0.151761 0.124432 -0.166273 -0.010833
24 0.114655 -0.065577 0.026070 -0.206914 0.120101 0.031381 0.100395
25 0.033088 -0.100286 -0.058682 0.178338 0.127454 0.120163 0.043167
26 0.014247 -0.036817 -0.258493 0.164146 -0.008177 0.094029 -0.048428
27 0.264538 0.143681 -0.090983 -0.134268 0.017295 0.050808 0.165177
28 -0.071104 0.001624 -0.061773 -0.209436 0.047917 -0.157911 0.014677
29 0.044157 0.032685 -0.102867 -0.205007 -0.124667 0.110479 -0.157199
30 0.005159 0.024566 0.036919 0.051664 0.112565 0.017934 -0.097106
31 -0.122877 -0.042648 -0.252617 0.018889 0.007359 -0.099293 -0.040859
32 0.024959 -0.006291 -0.000861 0.097524 0.187282 0.011025 -0.048922
33 0.009566 -0.119572 0.011320 0.077451 0.131781 -0.013956 0.008587
34 -0.138435 0.204523 -0.142875 -0.182783 0.131051 -0.014408 -0.020010
35 0.010043 0.147489 -0.115384 0.153281 0.205369 -0.005152 0.054440
36 -0.013053 -0.044556 0.180746 -0.058914 0.063750 0.101186 0.138202
37 0.003064 -0.059021 -0.333295 -0.114103 -0.067133 -0.029699 0.051244
38 0.005301 -0.012623 0.270708 -0.188178 0.167777 0.002485 -0.011387
39 -0.124796 -0.110234 -0.016340 0.019319 -0.011862 -0.053443 0.077511
40 0.080982 0.093244 -0.112591 -0.079579 0.075012 -0.012072 0.012028
41 -0.010838 -0.000406 -0.038418 0.102599 -0.295082 0.033703 -0.034795
42 -0.151969 0.038086 0.143270 -0.114462 -0.131161 0.020794 0.297013
43 -0.028292 -0.190484 -0.298549 -0.097249 0.101027 -0.007354 0.106816
44 0.016459 -0.176366 -0.073014 -0.074395 0.471003 -0.023546 -0.139236
45 0.027183 -0.132076 -0.234859 -0.060666 0.053379 0.028489 -0.140251
46 -0.026839 -0.066351 0.125087 -0.066543 0.001399 -0.014754 0.333992
47 -0.060087 -0.015580 -0.043032 0.112853 0.114545 0.062067 -0.151341
48 -0.026478 0.008901 0.109863 0.049175 0.118161 -0.002270 -0.283479
49 0.005352 -0.189703 -0.151468 -0.064979 -0.144190 -0.026037 0.259114
50 0.034925 -0.008133 -0.000400 0.012034 -0.107193 -0.004784 -0.285492
51 0.028093 -0.241729 0.082577 -0.002079 -0.145300 0.016448 -0.416547
52 0.027884 -0.130415 -0.033874 -0.017833 -0.035384 0.034810 -0.081980
53 0.007259 0.250916 -0.041261 -0.020594 0.145511 -0.003784 0.098696
54 -0.050117 -0.033406 -0.018522 -0.042330 0.086692 -0.034753 0.122512
55 0.040288 0.034894 0.075279 0.014791 0.016482 -0.029701 -0.123787
56 0.027306 0.302790 -0.044487 0.020807 0.077065 0.001640 -0.159534
57 -0.016781 -0.509135 0.104882 0.008200 0.048031 -0.003710 0.020242
58 0.001090 -0.215202 -0.126375 -0.013029 0.009917 -0.003117 0.159936
59 0.000757 0.012502 0.072984 -0.034331 -0.021212 -0.003769 0.088067
ARCLK ASELS ASUZU ... TTKOM TUKAS TUPRS USAK \
0 -0.124655 -0.176649 -0.152185 ... 0.011291 -0.115707 -0.169414 -0.130355
1 -0.032705 0.003778 -0.088927 ... -0.268609 0.065278 0.096402 -0.142497
2 0.251670 0.005959 -0.127121 ... 0.110387 0.057695 -0.011123 0.079051
3 -0.038360 -0.090251 0.095978 ... 0.114039 0.104382 -0.033534 -0.177370
4 -0.123666 0.063168 0.098447 ... -0.066522 -0.320755 -0.067071 0.091815
5 -0.161080 -0.056906 -0.020432 ... 0.010542 0.263910 0.089874 -0.001966
6 0.067813 -0.024458 -0.065776 ... -0.114462 -0.278942 0.020599 -0.153688
7 -0.104345 0.113283 0.051435 ... -0.082362 0.031030 0.046583 0.016580
8 0.006664 0.119906 -0.160681 ... -0.133465 0.031005 0.147925 0.228954
9 0.023125 -0.034343 -0.008719 ... -0.074891 0.012079 -0.061714 -0.067155
10 -0.055180 -0.030730 0.008656 ... -0.088462 -0.140438 -0.034052 -0.196688
11 0.066094 0.069978 0.084270 ... 0.068673 -0.027331 -0.008766 -0.088879
12 0.048971 -0.019740 -0.021615 ... 0.030224 -0.064646 0.000524 0.143203
13 0.103854 -0.057010 -0.085996 ... 0.020913 0.173693 0.031591 -0.059036
14 0.031915 -0.109372 -0.097648 ... 0.319629 0.357978 0.050767 0.126956
15 -0.129523 0.034016 0.137257 ... -0.095622 0.240360 0.047355 0.023699
16 -0.028427 -0.021454 0.097923 ... -0.259315 0.100060 -0.011701 0.157115
17 0.050730 0.033792 -0.211358 ... 0.259063 0.034795 0.018639 -0.133439
18 -0.126534 -0.019365 -0.040688 ... 0.180777 -0.081383 -0.011710 -0.076632
19 0.039728 -0.060765 -0.135635 ... -0.004601 -0.194376 -0.069111 0.180816
20 0.005931 -0.096832 0.109330 ... -0.022625 0.083732 -0.090806 0.058641
21 -0.037471 -0.009767 0.136786 ... 0.065938 -0.203248 -0.055833 0.083576
22 -0.010573 0.038690 -0.066436 ... -0.016891 0.076976 0.022905 -0.251359
23 0.067526 0.064384 -0.028129 ... -0.125857 0.238077 -0.066952 -0.150650
24 -0.026919 0.113481 -0.147130 ... 0.111072 -0.044124 -0.099316 0.072469
25 -0.090686 -0.037084 -0.067807 ... 0.217019 -0.042028 -0.016617 -0.029907
26 0.061717 0.097531 0.126139 ... 0.139385 0.276400 -0.097761 0.100972
27 0.106050 -0.022816 0.001448 ... 0.163660 -0.033907 -0.080165 0.275601
28 -0.185517 0.055055 0.037534 ... 0.173920 -0.011963 0.164670 0.101822
29 -0.048602 0.008294 -0.010636 ... -0.069431 0.142148 0.101402 -0.091956
30 -0.081624 0.163102 0.307790 ... -0.191309 -0.075409 -0.013078 -0.068913
31 0.016432 -0.024424 0.360749 ... 0.389031 -0.219247 -0.009343 0.063870
32 0.196253 -0.139004 0.285699 ... -0.050296 0.068673 -0.014651 0.129046
33 -0.182715 -0.048886 -0.025525 ... 0.086549 -0.168292 0.078617 0.025893
34 0.095516 0.062422 0.192874 ... -0.082725 0.000227 -0.146867 -0.183764
35 0.042084 0.034378 -0.235333 ... 0.118004 -0.190314 0.030513 -0.133813
36 -0.167164 0.067171 0.081514 ... -0.031567 -0.038715 -0.043978 -0.186232
37 0.059351 -0.042778 0.278041 ... -0.162018 0.039971 -0.072167 0.118201
38 -0.045813 -0.214241 0.144298 ... 0.200754 0.060305 -0.052715 -0.368025
39 -0.026885 -0.004904 0.118536 ... 0.183545 0.144384 -0.113019 0.061747
40 -0.414448 0.102412 -0.190054 ... -0.037817 0.103095 -0.171996 0.260250
41 -0.009521 -0.030462 -0.075652 ... -0.063382 0.013423 -0.116174 -0.031475
42 -0.196923 -0.087439 0.160789 ... -0.074668 0.005044 0.047029 0.188417
43 -0.194654 -0.051763 -0.040742 ... -0.035144 0.094323 0.087288 -0.118622
44 0.247694 -0.146731 -0.203355 ... -0.126526 -0.097950 -0.049801 0.175830
45 -0.181327 0.012670 -0.033894 ... 0.007198 0.054593 -0.069217 -0.038636
46 -0.068321 0.179454 -0.073943 ... -0.029118 -0.058548 -0.081229 -0.074020
47 -0.039079 -0.148131 -0.137673 ... -0.124314 0.059853 -0.270345 0.044812
48 -0.275193 0.037574 0.091101 ... 0.095810 -0.075606 0.297845 0.107232
49 0.340168 -0.012469 0.073329 ... -0.023999 -0.015899 0.095437 -0.057028
50 0.075673 -0.075572 0.100334 ... 0.067682 0.014641 -0.009953 -0.050865
51 0.006275 0.472322 -0.003302 ... 0.055476 -0.019919 0.045956 -0.040336
52 0.101918 0.119728 0.100622 ... 0.028013 -0.027735 -0.187413 0.076840
53 0.165297 0.565882 -0.013550 ... 0.048270 0.070423 0.106546 -0.022905
54 -0.066018 -0.044682 -0.028647 ... 0.014358 0.033076 -0.116004 0.009319
55 0.101909 -0.196163 -0.070056 ... -0.090041 -0.042800 0.431337 0.084701
56 -0.068488 -0.170296 0.068620 ... -0.064024 -0.002967 0.097151 -0.018809
57 0.080839 0.028037 -0.050006 ... 0.049323 -0.029921 -0.123425 0.035497
58 0.033552 -0.055824 0.028844 ... 0.025924 -0.024820 0.514393 -0.041002
59 -0.005886 0.107141 0.025298 ... -0.009675 -0.012164 0.023458 -0.011096
VAKBN VESTL YATAS YKBNK YUNSA ZOREN
0 -0.097293 -0.157651 -0.168918 0.037678 -0.087511 -0.131610
1 -0.249343 0.035116 -0.017233 -0.279360 -0.000860 -0.097437
2 0.022534 0.082714 -0.071226 -0.055424 -0.267015 0.069573
3 -0.075411 0.135683 -0.045575 0.010490 0.188490 0.207352
4 -0.102075 -0.027652 0.004609 -0.123473 0.161817 0.143059
5 0.020483 0.136444 -0.026446 0.099011 -0.023845 0.112381
6 0.105345 -0.001731 -0.106294 0.066752 0.058542 0.065990
7 -0.040193 -0.088092 0.114997 -0.123584 -0.105593 0.059178
8 -0.037669 -0.195789 -0.140023 -0.068553 -0.052067 -0.184885
9 0.002835 0.246614 0.020698 0.151362 -0.094010 0.243209
10 -0.040833 0.073624 0.050855 0.156392 -0.009638 -0.007291
11 0.041417 -0.052017 0.258278 -0.087463 -0.338253 0.012136
12 0.036069 0.114764 0.046831 -0.029281 0.177640 -0.072637
13 -0.018914 -0.133763 -0.030164 -0.080943 -0.048286 -0.304335
14 -0.002722 0.077823 0.014745 0.090558 0.059591 -0.042285
15 -0.104422 0.033216 -0.135501 -0.037410 0.062810 -0.161058
16 0.078515 0.221362 0.103737 0.035183 0.024489 -0.155844
17 -0.067043 -0.030218 0.016484 -0.162123 0.024618 0.292681
18 0.004022 -0.171897 0.194950 0.056921 0.397113 -0.296935
19 0.101843 -0.303697 0.079532 -0.118529 -0.103109 -0.059768
20 0.020304 -0.037174 -0.209290 -0.156873 -0.203302 0.260174
21 -0.084727 0.086178 -0.015370 -0.004374 -0.212708 -0.085157
22 0.052818 -0.008955 0.066427 -0.200032 -0.018244 -0.214457
23 0.061629 -0.034048 0.107109 0.113638 0.195566 -0.023223
24 -0.120516 -0.061378 0.139224 -0.147479 0.378110 0.239333
25 -0.142125 -0.036263 0.095360 -0.068822 -0.112429 -0.138734
26 -0.008243 -0.190749 0.181972 -0.045117 -0.074120 -0.020122
27 -0.007852 0.110178 -0.315156 -0.050448 0.199474 -0.025504
28 -0.078951 -0.182567 -0.044970 0.188885 -0.101274 0.231323
29 0.014295 -0.065630 0.086595 -0.243597 0.084445 0.102730
30 0.061431 -0.182519 -0.235187 0.140675 0.162455 0.116428
31 0.091447 0.099113 -0.195808 -0.116719 -0.036282 -0.201761
32 0.163081 -0.090998 0.157363 -0.149370 0.179985 0.171750
33 0.081526 0.287324 0.386418 0.174639 -0.033254 -0.106233
34 -0.094853 0.143703 -0.117339 -0.143349 0.025409 -0.254484
35 0.127179 -0.005158 0.176035 -0.134187 0.017659 0.045184
36 -0.037365 0.149862 0.086180 -0.112522 -0.238988 0.080567
37 0.078318 -0.236720 0.358247 0.083786 -0.044651 0.037078
38 0.225834 -0.216257 -0.092689 -0.039684 -0.058540 0.024330
39 0.026749 0.142367 0.036135 0.013533 -0.046625 -0.039870
40 0.190792 -0.018509 -0.024903 0.157025 -0.095567 -0.036821
41 0.160414 0.349410 0.090798 -0.205143 0.079729 0.044719
42 0.034564 -0.031061 0.056238 -0.275459 -0.036847 0.019721
43 0.066393 -0.139757 -0.112957 -0.066630 -0.041057 0.102084
44 -0.023963 0.020909 0.004391 -0.125744 -0.053889 0.056183
45 0.291364 0.113996 -0.140177 -0.172513 0.083299 0.003329
46 0.156611 -0.024379 -0.049944 0.194328 -0.042631 -0.010570
47 -0.229786 0.053449 -0.075186 0.128416 -0.043417 0.014165
48 -0.265252 0.102687 0.076005 -0.103218 0.000829 0.038499
49 -0.035370 0.045991 0.054114 0.272486 -0.011652 0.003103
50 -0.210054 -0.070880 0.062943 0.178426 0.003477 -0.022193
51 0.287339 0.002066 -0.061784 0.060832 -0.032127 -0.025123
52 -0.099243 0.099843 0.021899 -0.099763 -0.031748 -0.013786
53 -0.039708 0.066412 -0.014855 -0.084461 -0.026296 0.015411
54 0.263428 0.049992 -0.028897 -0.048390 -0.014019 -0.041728
55 0.346039 0.036969 -0.052743 -0.003505 -0.009637 0.002799
56 0.125464 0.051240 0.040458 -0.094102 -0.000452 -0.032783
57 0.154996 -0.001366 -0.011249 0.003733 0.032426 -0.012475
58 -0.066220 0.063381 -0.092379 0.058802 0.024252 -0.039279
59 -0.037163 0.021461 -0.013667 -0.045754 -0.003362 -0.057624
[60 rows x 60 columns]
--------------------------------------------------------------------------- PermissionError Traceback (most recent call last) Cell In[53], line 48 45 print("\nLoadings Table:") 46 print(loadings) ---> 48 component_stats.to_excel('component_stats.xlsx',index=False) 49 loadings.to_excel('loadings.xlsx',index=False) File ~\anaconda3\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs) 209 else: 210 kwargs[new_arg_name] = new_arg_value --> 211 return func(*args, **kwargs) File ~\anaconda3\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs) 209 else: 210 kwargs[new_arg_name] = new_arg_value --> 211 return func(*args, **kwargs) File ~\anaconda3\lib\site-packages\pandas\core\generic.py:2374, in NDFrame.to_excel(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, encoding, inf_rep, verbose, freeze_panes, storage_options) 2361 from pandas.io.formats.excel import ExcelFormatter 2363 formatter = ExcelFormatter( 2364 df, 2365 na_rep=na_rep, (...) 2372 inf_rep=inf_rep, 2373 ) -> 2374 formatter.write( 2375 excel_writer, 2376 sheet_name=sheet_name, 2377 startrow=startrow, 2378 startcol=startcol, 2379 freeze_panes=freeze_panes, 2380 engine=engine, 2381 storage_options=storage_options, 2382 ) File ~\anaconda3\lib\site-packages\pandas\io\formats\excel.py:944, in ExcelFormatter.write(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options) 940 need_save = False 941 else: 942 # error: Cannot instantiate abstract class 'ExcelWriter' with abstract 943 # attributes 'engine', 'save', 'supported_extensions' and 'write_cells' --> 944 writer = ExcelWriter( # type: ignore[abstract] 945 writer, engine=engine, storage_options=storage_options 946 ) 947 need_save = True 949 try: File ~\anaconda3\lib\site-packages\pandas\io\excel\_openpyxl.py:60, in OpenpyxlWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs) 56 from openpyxl.workbook import Workbook 58 engine_kwargs = combine_kwargs(engine_kwargs, kwargs) ---> 60 super().__init__( 61 path, 62 mode=mode, 63 storage_options=storage_options, 64 if_sheet_exists=if_sheet_exists, 65 engine_kwargs=engine_kwargs, 66 ) 68 # ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from 69 # the file and later write to it 70 if "r+" in self._mode: # Load from existing workbook File ~\anaconda3\lib\site-packages\pandas\io\excel\_base.py:1313, in ExcelWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs) 1309 self._handles = IOHandles( 1310 cast(IO[bytes], path), compression={"compression": None} 1311 ) 1312 if not isinstance(path, ExcelWriter): -> 1313 self._handles = get_handle( 1314 path, mode, storage_options=storage_options, is_text=False 1315 ) 1316 self._cur_sheet = None 1318 if date_format is None: File ~\anaconda3\lib\site-packages\pandas\io\common.py:865, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 856 handle = open( 857 handle, 858 ioargs.mode, (...) 861 newline="", 862 ) 863 else: 864 # Binary mode --> 865 handle = open(handle, ioargs.mode) 866 handles.append(handle) 868 # Convert BytesIO or file objects passed with an encoding PermissionError: [Errno 13] Permission denied: 'component_stats.xlsx'
##to see the companies that behave in the most similar way
# 'n_components' is the total number of components
# Create a dictionary to store the counts of same-sign numbers for pairs of companies
similar_sign_counts = {}
for component_number in range(n_components):
# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]
# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index
# Update the dictionary with positive correlations
for i, company1 in enumerate(positive_correlations):
for j, company2 in enumerate(positive_correlations):
if i != j:
pair = (company1, company2)
if pair not in similar_sign_counts:
similar_sign_counts[pair] = 0
similar_sign_counts[pair] += 1
# Update the dictionary with negative correlations
for i, company1 in enumerate(negative_correlations):
for j, company2 in enumerate(negative_correlations):
if i != j:
pair = (company1, company2)
if pair not in similar_sign_counts:
similar_sign_counts[pair] = 0
similar_sign_counts[pair] += 1
# Find the pairs of companies with the most common same-sign numbers
sorted_pairs = sorted(similar_sign_counts.items(), key=lambda x: x[1], reverse=True)
# Print the first 30 odd-numbered pairs and their common same-sign count #since it lists the same pair twice
print("First 30 odd-numbered pairs of companies with the most common same-sign numbers:")
for i, (pair, count) in enumerate(sorted_pairs[:60], start=1):
if i % 2 != 0:
print(f"Pair: {pair}, Common Same-Sign Count: {count}")
First 30 odd-numbered pairs of companies with the most common same-sign numbers:
Pair: ('EREGL', 'KRDMD'), Common Same-Sign Count: 47
Pair: ('ANACM', 'SISE'), Common Same-Sign Count: 41
Pair: ('KCHOL', 'TUPRS'), Common Same-Sign Count: 41
Pair: ('TKFEN', 'ASELS'), Common Same-Sign Count: 41
Pair: ('BAGFS', 'TTKOM'), Common Same-Sign Count: 40
Pair: ('AYGAZ', 'USAK'), Common Same-Sign Count: 40
Pair: ('KCHOL', 'SODA'), Common Same-Sign Count: 40
Pair: ('SAHOL', 'OTKAR'), Common Same-Sign Count: 40
Pair: ('SODA', 'KCHOL'), Common Same-Sign Count: 40
Pair: ('SODA', 'TTKOM'), Common Same-Sign Count: 40
Pair: ('ANACM', 'SODA'), Common Same-Sign Count: 39
Pair: ('AYGAZ', 'ECILC'), Common Same-Sign Count: 39
Pair: ('ECILC', 'AYGAZ'), Common Same-Sign Count: 39
Pair: ('GOODY', 'ISCTR'), Common Same-Sign Count: 39
Pair: ('KCHOL', 'FROTO'), Common Same-Sign Count: 39
Pair: ('SODA', 'ANACM'), Common Same-Sign Count: 39
Pair: ('AKBNK', 'GARAN'), Common Same-Sign Count: 38
Pair: ('ALBRK', 'BANVT'), Common Same-Sign Count: 38
Pair: ('ANACM', 'TUPRS'), Common Same-Sign Count: 38
Pair: ('CEMAS', 'SAHOL'), Common Same-Sign Count: 38
Pair: ('GARAN', 'ISDMR'), Common Same-Sign Count: 38
Pair: ('ISCTR', 'VAKBN'), Common Same-Sign Count: 38
Pair: ('KAREL', 'PRKME'), Common Same-Sign Count: 38
Pair: ('PRKME', 'KAREL'), Common Same-Sign Count: 38
Pair: ('SASA', 'THYAO'), Common Same-Sign Count: 38
Pair: ('THYAO', 'PETKM'), Common Same-Sign Count: 38
Pair: ('THYAO', 'TUKAS'), Common Same-Sign Count: 38
Pair: ('TRKCM', 'TKFEN'), Common Same-Sign Count: 38
Pair: ('TUKAS', 'THYAO'), Common Same-Sign Count: 38
Pair: ('VAKBN', 'ISCTR'), Common Same-Sign Count: 38
#### from the loadings table, it can be seen that 14th component covers 95% of the variance.
#I made the PCA analysis one more time with 14 components.
####### PCA ANALYSIS
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
# Assuming your DataFrame is named "allticks_data" with the stock prices
# Drop the timestamp column
stock_data = allticks_data.drop(columns=['timestamp'])
# Create an imputer to fill missing values with the column mean
imputer = SimpleImputer(strategy='mean')
stock_data_imputed = imputer.fit_transform(stock_data)
# Standardize the data
scaler = StandardScaler()
stock_data_standardized = scaler.fit_transform(stock_data_imputed)
# Initialize and fit the PCA model
n_components = 14 # Number of principal components to retain
pca = PCA(n_components=n_components)
pca.fit(stock_data_standardized)
# Get the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
# Create a table for component statistics
component_stats = pd.DataFrame({
'Component': range(1, len(explained_variance_ratio) + 1),
'Standard Deviation': np.sqrt(pca.explained_variance_),
'Proportion of Variance': pca.explained_variance_ratio_,
'Cumulative Proportion': np.cumsum(pca.explained_variance_ratio_)
})
# Display the component statistics table
print("Component Statistics:")
print(component_stats)
# Create a table for loadings
loadings = pd.DataFrame(pca.components_, columns=stock_data.columns)
# Display the loadings table
print("\nLoadings Table:")
print(loadings)
component_stats.to_excel('component_stats2.xlsx',index=False)
loadings.to_excel('loadings2.xlsx',index=False)
Component Statistics:
Component Standard Deviation Proportion of Variance \
0 1 5.411674 0.488094
1 2 3.196438 0.170283
2 3 2.536973 0.107268
3 4 1.730182 0.049891
4 5 1.511590 0.038081
5 6 1.133295 0.021406
6 7 0.958872 0.015324
7 8 0.892638 0.013280
8 9 0.783668 0.010235
9 10 0.729850 0.008878
10 11 0.707544 0.008343
11 12 0.653014 0.007107
12 13 0.611126 0.006224
13 14 0.597243 0.005945
Cumulative Proportion
0 0.488094
1 0.658377
2 0.765646
3 0.815537
4 0.853618
5 0.875023
6 0.890347
7 0.903627
8 0.913862
9 0.922740
10 0.931083
11 0.938190
12 0.944415
13 0.950360
Loadings Table:
AEFES AKBNK AKSA AKSEN ALARK ALBRK ANACM \
0 -0.010619 -0.115330 -0.162991 -0.074086 -0.139545 -0.008576 -0.168831
1 -0.164429 -0.212004 -0.033005 -0.159788 -0.081720 -0.112583 0.058363
2 -0.261382 0.073484 0.068234 -0.147875 -0.106180 -0.203374 -0.094810
3 0.014416 -0.094138 0.076323 -0.196707 -0.158188 0.248636 -0.086114
4 0.015394 -0.105984 0.066446 0.236019 -0.223156 -0.122610 -0.011328
5 -0.018139 0.085660 -0.180828 0.241495 -0.126733 0.140710 0.105549
6 0.075172 0.108106 -0.199154 0.119552 -0.150211 0.406539 0.056820
7 0.059575 -0.046843 -0.060446 -0.042836 -0.004831 0.053880 -0.000583
8 -0.114284 -0.051815 -0.063431 -0.068982 0.026342 0.118115 -0.062175
9 -0.192200 0.000007 -0.083598 -0.118614 0.117635 0.089625 -0.042355
10 0.455559 0.108448 -0.128295 -0.144293 -0.036526 -0.476646 -0.049103
11 0.273705 0.003007 0.064817 -0.091986 -0.016509 0.160398 -0.059566
12 0.101925 -0.147138 0.080896 0.007856 -0.040817 -0.092892 0.010144
13 0.364150 -0.059971 0.170873 0.019021 -0.014857 0.263476 0.050102
ARCLK ASELS ASUZU ... TTKOM TUKAS TUPRS USAK \
0 -0.124655 -0.176649 -0.152185 ... 0.011291 -0.115707 -0.169414 -0.130355
1 -0.032705 0.003778 -0.088927 ... -0.268609 0.065278 0.096402 -0.142497
2 0.251670 0.005959 -0.127121 ... 0.110387 0.057695 -0.011123 0.079051
3 -0.038360 -0.090251 0.095978 ... 0.114039 0.104382 -0.033534 -0.177370
4 -0.123666 0.063168 0.098447 ... -0.066522 -0.320755 -0.067071 0.091815
5 -0.161081 -0.056907 -0.020431 ... 0.010542 0.263910 0.089874 -0.001966
6 0.067813 -0.024457 -0.065777 ... -0.114462 -0.278941 0.020598 -0.153688
7 -0.104347 0.113289 0.051419 ... -0.082354 0.031025 0.046581 0.016583
8 0.006667 0.119897 -0.160641 ... -0.133479 0.031025 0.147930 0.228955
9 0.023116 -0.034316 -0.008794 ... -0.074870 0.012061 -0.061723 -0.067156
10 -0.055182 -0.030732 0.008691 ... -0.088477 -0.140426 -0.034041 -0.196680
11 0.066105 0.069923 0.084401 ... 0.068581 -0.027344 -0.008731 -0.088901
12 0.048877 -0.019558 -0.022105 ... 0.030389 -0.064620 0.000541 0.143114
13 0.103948 -0.057326 -0.085095 ... 0.020563 0.174027 0.031630 -0.059159
VAKBN VESTL YATAS YKBNK YUNSA ZOREN
0 -0.097293 -0.157651 -0.168918 0.037678 -0.087511 -0.131610
1 -0.249343 0.035116 -0.017233 -0.279360 -0.000860 -0.097437
2 0.022534 0.082714 -0.071226 -0.055424 -0.267015 0.069573
3 -0.075411 0.135683 -0.045575 0.010490 0.188490 0.207352
4 -0.102075 -0.027652 0.004609 -0.123473 0.161818 0.143059
5 0.020483 0.136444 -0.026447 0.099011 -0.023847 0.112380
6 0.105346 -0.001733 -0.106291 0.066751 0.058548 0.065992
7 -0.040198 -0.088100 0.115011 -0.123578 -0.105553 0.059185
8 -0.037660 -0.195783 -0.140053 -0.068580 -0.052145 -0.184886
9 0.002821 0.246574 0.020774 0.151392 -0.093817 0.243242
10 -0.040823 0.073622 0.050823 0.156377 -0.009686 -0.007276
11 0.041462 -0.051917 0.258117 -0.087461 -0.338558 0.012085
12 0.035977 0.114288 0.047358 -0.029182 0.178952 -0.072451
13 -0.018846 -0.133331 -0.031088 -0.081587 -0.050757 -0.304658
[14 rows x 60 columns]
PCA Analysis
I started the analysis with selecting the number of components as the original column dimension, 60 (after dropping the timestamp column). I observed that component 14 covers the 95% of the variance. So, by this PCA analysis, 14 components may be enough to represent the data. Even with a higher threshold of 99%, 33 components are enough. Thus, it can be said that the dimension can be reduced to half or even quarter of the original with PCA.
In PCA two features having the same-sign contribution to the components means that these companies behave similarly, such that they get affected by the changes (in market in this case) similarly. This can indicate that such companies have a similar working field. Thus, the companies that have the most common same-sign in component loadings are examined pairwise. The first pair is EREGL (EREĞLİ DEMİR VE ÇELİK FABRİKALARI )and KRDMD ( KARDEMİR KARABÜK DEMİR ÇELİK SANAYİ VE TİCARET A.Ş.) with the 47 common same signs. Their working field are exactly the same, so, it is a predictable and meaningful result. Next, ANACM( Anadolu Cam Sanayii A.Ş) and SISE (Turkiye Sise ve Cam Fabrikalari AS) with the count of 41. Again, they have the same industry. Another pair with count of 41 is KCHOL and TUPRS, TÜPRAŞ is a member of the companies of Koç Holding, so they are integrated. Another pair with count of 41 are TKFEN and ASELS. TKFEN produces chemical fertilizers, ASELSAN produces defense electronics, but they are both industrial companies that works with mass productions. They can be affected similarly from industry related investments. AKBNK (AKBANK) and GARAN (GARANTI BBVA) also have a high number of same-sign count of 38 since they are both banks. There are many pairs that seems unrelated, these pairs may have high numbers of count only because of the changes of the overall economy of the country and the world.
APPENDIX: I used ChatGPT for help with writing the codes.
Question-I have a table called allticks_data, this table has columns 61 columns; first column is timestaps, and the rest is 60 companies's stock price values at this time. I want to plot the boxplot of each companies stock prices in Phyton. can you help me?
To create box plots for the stock prices of each of the 60 companies in your allticks_data DataFrame in Python, you can use the seaborn library in conjunction with matplotlib. Here's a step-by-step guide to help you accomplish this task:
Assuming that you have a DataFrame called allticks_data, where the first column is "timestamps" and the remaining columns represent the stock prices of different companies, you can create box plots as follows:
seaborn and matplotlib installed. If not, you can install them with:pip install seaborn matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
# Assuming your DataFrame is named "allticks_data"
# Select all the columns containing stock prices (excluding the "timestamps" column)
stock_columns = allticks_data.columns[1:] # Exclude the first column (timestamps)
# Create a box plot for each stock price column
for company in stock_columns:
sns.set(style="whitegrid") # Set the style of the plot (customize as needed)
sns.boxplot(x=allticks_data[company])
plt.xlabel("Price")
plt.title(f"Box Plot of {company} Stock Price")
plt.show()
This code will generate individual box plots for each of the 60 companies' stock prices in your DataFrame. You can customize the style and appearance of the plots to your preferences.
Make sure to adjust the code accordingly if your DataFrame or column names are different from what's mentioned in the code.
how do i calculate the variance and standard deviation of each column (except the first column, i.e. timestamp) ChatGPT To calculate the variance and standard deviation for each column (excluding the first column, i.e., the "timestamps" column) in a DataFrame, you can use the pandas library. Here's how you can do it: python Copy code import pandas as pd
columns_to_analyze = allticks_data.columns[1:]
variances = allticks_data[columns_to_analyze].var() std_deviations = allticks_data[columns_to_analyze].std()
statistics_df = pd.DataFrame({ 'Column': columns_to_analyze, 'Variance': variances, 'Standard Deviation': std_deviations })
print(statistics_df)
This code will create a new DataFrame (statistics_df) that contains the column names, their variances, and standard deviations. You can modify the code to save the results to a file or perform further analysis as needed.
I want to plot the first 30 rows of columns named YATAS and BANVT in the same figure; i want seperate graphs for all the following 30 rows until the end of data ChatGPT To plot the first 30 rows of columns "YATAS" and "BANVT" in the same figure and then create separate graphs for each subsequent set of 30 rows until the end of the data, you can use the following code: python Copy code import matplotlib.pyplot as plt
timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]
total_rows = len(allticks_data) num_sets = total_rows // 30
plt.figure(figsize=(12, 6))
for column in selected_columns: column_data = allticks_data[column][:30] plt.plot(timestamps[:30], column_data, label=column)
plt.xticks(rotation=45) # Rotate x-axis labels for readability
plt.xlabel("Timestamp") plt.ylabel("Price") plt.title("Stock Prices for the First 30 Rows")
plt.legend(loc="upper right")
plt.show()
for i in range(1, num_sets): plt.figure(figsize=(12, 6)) for column in selected_columns: start_row = i 30 end_row = (i + 1) 30 column_data = allticks_data[column][start_row:end_row] plt.plot(timestamps[start_row:end_row], column_data, label=column)
plt.xticks(rotation=45) # Rotate x-axis labels for readability
# Set labels and title for subsequent graphs
plt.xlabel("Timestamp")
plt.ylabel("Price")
plt.title(f"Stock Prices for Rows {start_row+1}-{end_row}")
# Add a legend to distinguish between columns
plt.legend(loc="upper right")
# Show the subsequent graphs
plt.show()
This code will first plot the first 30 rows of "YATAS" and "BANVT" in the same graph and then create separate graphs for each subsequent set of 30 rows until the end of the data, with each set of rows having its own graph. Adjust the figure size, labels, and other visual settings as needed.
i want seperate graphs for each 30 rows until the end of data of columns named YATAS and BANVT in the same figure; ChatGPT If you want separate graphs for each set of 30 rows for columns "YATAS" and "BANVT," all in the same figure, you can modify the code as follows: python Copy code import matplotlib.pyplot as plt
timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]
total_rows = len(allticks_data) num_sets = total_rows // 30
plt.figure(figsize=(12, 6))
for i in range(num_sets): plt.figure(figsize=(12, 6)) for column in selected_columns: start_row = i 30 end_row = (i + 1) 30 column_data = allticks_data[column][start_row:end_row] plt.plot(timestamps[start_row:end_row], column_data, label=column)
plt.xticks(rotation=45) # Rotate x-axis labels for readability
# Set labels and title for each graph
plt.xlabel("Timestamp")
plt.ylabel("Price")
plt.title(f"Stock Prices for Rows {start_row+1}-{end_row}")
# Add a legend to distinguish between columns
plt.legend(loc="upper right")
# Show each graph
plt.show()
This code will create separate graphs for each set of 30 rows for columns "YATAS" and "BANVT" and display them all in the same figure. Each set of rows will have its own graph with separate x- and y-axis labels.
i want to see the correlations of those two columns for 30 row's period ChatGPT To calculate the correlation between two columns ("YATAS" and "BANVT") for a 30-row period in your DataFrame, you can use the pandas library. You can extract the data for the specific period and then calculate the correlation between the two columns. Here's an example of how to do it: python Copy code
timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]
total_rows = len(allticks_data) num_sets = total_rows // 30
correlations = []
for i in range(num_sets): start_row = i 30 end_row = (i + 1) 30 period_data = allticks_data[selected_columns][start_row:end_row] correlation = period_data.corr().iloc[0, 1] # Calculate the correlation between the two columns correlations.append(correlation)
print(correlations)
In this code, we extract the data for each 30-row period, calculate the correlation between the "YATAS" and "BANVT" columns within that period using the .corr() method, and store the correlation values in the correlations list. You can choose to print, visualize, or analyze the correlation values as needed.
i also want to plot the correlation ChatGPT If you want to plot the correlations of "YATAS" and "BANVT" for each 30-row period, you can do so using a line plot. Here's how you can modify the code to achieve this: python Copy code import matplotlib.pyplot as plt
timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]
total_rows = len(allticks_data) num_sets = total_rows // 30
correlations = [] x_labels = []
for i in range(num_sets): start_row = i 30 end_row = (i + 1) 30 period_data = allticks_data[selected_columns][start_row:end_row] correlation = period_data.corr().iloc[0, 1] # Calculate the correlation between the two columns correlations.append(correlation) x_labels.append(f"Rows {start_row + 1}-{end_row}")
plt.figure(figsize=(12, 6)) plt.plot(x_labels, correlations, marker='o', linestyle='-', color='b')
plt.xlabel("30-Row Periods") plt.ylabel("Correlation") plt.title("Correlation Between YATAS and BANVT for 30-Row Periods")
plt.xticks(rotation=45)
plt.show()
This code will create a line plot that shows the correlations between "YATAS" and "BANVT" for each 30-row period, and the x-axis labels represent the corresponding row periods. You can adjust the figure size, labels, and other visual settings as needed.
i want to have plots for each 30-row period ChatGPT If you want to create separate plots for each 30-row period and visualize the correlation between "YATAS" and "BANVT" for each period, you can modify the code as follows: python Copy code import matplotlib.pyplot as plt
timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]
total_rows = len(allticks_data) num_sets = total_rows // 30
correlations = [] x_labels = []
for i in range(num_sets): start_row = i 30 end_row = (i + 1) 30 period_data = allticks_data[selected_columns][start_row:end_row] correlation = period_data.corr().iloc[0, 1] # Calculate the correlation between the two columns correlations.append(correlation) x_labels.append(f"Rows {start_row + 1}-{end_row}")
# Create a line plot for the correlation
plt.figure(figsize=(12, 6))
plt.plot(timestamps[start_row:end_row], period_data[selected_columns])
# Set labels and title for each plot
plt.xlabel("Timestamp")
plt.ylabel("Price")
plt.title(f"Stock Prices for Rows {start_row + 1}-{end_row}")
# Add a legend to distinguish between columns
plt.legend(selected_columns, loc="upper right")
# Show each plot
plt.show()
This code will create separate plots for each 30-row period, displaying the stock prices of "YATAS" and "BANVT" for each period, and calculating and displaying the correlation values in the respective plots. Each plot will have its own x-axis labels and titles. Adjust the figure size, labels, and other visual settings as needed.
can i calculate the correlation between those two columns for the first 30 rows ChatGPT Certainly, you can calculate the correlation between the "YATAS" and "BANVT" columns for the first 30 rows. Here's how you can do it: python Copy code
timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]
period_data = allticks_data[selected_columns][:30]
correlation = period_data.corr().iloc[0, 1]
print(f"Correlation between YATAS and BANVT for the first 30 rows: {correlation}")
This code will extract the data for the first 30 rows of "YATAS" and "BANVT," calculate the correlation between these two columns, and print the correlation value. You can use or analyze the correlation value as needed.
i want to see the correlations of those two columns for 30 row's period, i want a list of all correlations values but i want to see the start row and end row of each one in the list too. ChatGPT If you want to calculate the correlations between the "YATAS" and "BANVT" columns for each 30-row period and have a list that includes both the correlation values and the corresponding start and end row for each period, you can do so using the following code: python Copy code
timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]
total_rows = len(allticks_data) num_sets = total_rows // 30
correlations_with_periods = []
for i in range(num_sets): start_row = i 30 end_row = (i + 1) 30 period_data = allticks_data[selected_columns][start_row:end_row] correlation = period_data.corr().iloc[0, 1] # Calculate the correlation between the two columns correlations_with_periods.append((start_row, end_row, correlation))
for start_row, end_row, correlation in correlations_with_periods: print(f"Correlation between YATAS and BANVT for Rows {start_row + 1}-{end_row}: {correlation}")
This code will calculate the correlations for each 30-row period and create a list correlations_with_periods that contains tuples of start row, end row, and correlation value for each period. It then displays the list with the corresponding information. You can further use or analyze this list as needed.
instead of start and end row numbers, i want to see the timestamp in the timestamp column that are in those rows in the list. ChatGPT If you want to see the timestamps from the "timestamps" column for the corresponding start and end rows within each 30-row period in the list, you can modify the code as follows: python Copy code
timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]
total_rows = len(allticks_data) num_sets = total_rows // 30
correlations_with_periods = []
for i in range(num_sets): start_row = i 30 end_row = (i + 1) 30 period_data = allticks_data[selected_columns][start_row:end_row] correlation = period_data.corr().iloc[0, 1] # Calculate the correlation between the two columns start_timestamp = timestamps.iloc[start_row] # Get the start timestamp end_timestamp = timestamps.iloc[end_row - 1] # Get the end timestamp correlations_with_periods.append((start_timestamp, end_timestamp, correlation))
for start_timestamp, end_timestamp, correlation in correlations_with_periods: print(f"Correlation between YATAS and BANVT from {start_timestamp} to {end_timestamp}: {correlation}")
In this modified code, the start_timestamp and end_timestamp are extracted from the "timestamps" column for the corresponding rows in each 30-row period, and these timestamps are included in the list along with the correlation values. This provides you with the start and end timestamps for each period in the list.
i want to see the values in the column named "timestamp" ChatGPT If you want to see the values in the "timestamps" column, you can extract and print these values from your DataFrame. Here's how to do it: python Copy code
timestamps_values = allticks_data['timestamps'].tolist()
print(timestamps_values)
This code will extract the values from the "timestamps" column and store them in a list timestamps_values, and then print the list of timestamp values.
i want to see the first 100 columns in the column named timestamp ChatGPT If you want to view the first 100 values in the "timestamps" column, you can extract and print a subset of the values. Here's how to do it: python Copy code
timestamps_values = allticks_data['timestamps'].iloc[:100]
print(timestamps_values)
This code will extract the first 100 values from the "timestamps" column and print them. Adjust the number 100 in iloc[:100] if you want to view a different number of values.
I want to do the same for each 750 row. instead of start and end row numbers, i want to see the timestamp in the timestamp column that are in those rows in the list. I also want to have the plots of each 750 row, name the plots with the same name that period has on the list ChatGPT To create a list of correlations and corresponding timestamps for each 750-row period, and generate separate plots for each period while naming the plots with the period's timestamp values, you can use the following code: python Copy code import matplotlib.pyplot as plt import pandas as pd
timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]
total_rows = len(allticks_data) num_sets = total_rows // 750
correlations_with_timestamps = []
for i in range(num_sets): start_row = i 750 end_row = (i + 1) 750 period_data = allticks_data[selected_columns][start_row:end_row]
# Calculate the correlation between the two columns
correlation = period_data.corr().iloc[0, 1]
# Get the start and end timestamps
start_timestamp = timestamps.iloc[start_row]
end_timestamp = timestamps.iloc[end_row - 1]
# Add the correlation and timestamp information to the list
correlations_with_timestamps.append((start_timestamp, end_timestamp, correlation))
# Create a line plot for the period and save it with the period's timestamps as the name
plt.figure(figsize=(12, 6))
plt.plot(timestamps[start_row:end_row], period_data)
# Set labels and title for each plot
plt.xlabel("Timestamp")
plt.ylabel("Price")
plt.title(f"Stock Prices for {start_timestamp} to {end_timestamp}")
# Save the plot with the name based on timestamps
plt.savefig(f"{start_timestamp}_{end_timestamp}.png")
plt.close()
for start_timestamp, end_timestamp, correlation in correlations_with_timestamps: print(f"Correlation between YATAS and BANVT from {start_timestamp} to {end_timestamp}: {correlation}")
In this code, correlations are calculated for each 750-row period, and for each period, a line plot is created and saved with a name based on the period's timestamps. The list correlations_with_timestamps contains correlation values and timestamp information for each period, which is then printed. You can adjust the figure size, labels, and other visual settings as needed.
name the figures with row numbers instead ChatGPT To name the figures with row numbers instead of timestamps, you can modify the code as follows: python Copy code import matplotlib.pyplot as plt import pandas as pd
selected_columns = ["YATAS", "BANVT"]
total_rows = len(allticks_data) num_sets = total_rows // 750
correlations = []
for i in range(num_sets): start_row = i 750 end_row = (i + 1) 750 period_data = allticks_data[selected_columns][start_row:end_row]
# Calculate the correlation between the two columns
correlation = period_data.corr().iloc[0, 1]
correlations.append(correlation)
# Create a line plot for the period
plt.figure(figsize=(12, 6))
plt.plot(period_data)
# Set labels and title for each plot
plt.xlabel("Row Number")
plt.ylabel("Price")
plt.title(f"Stock Prices for Rows {start_row+1}-{end_row}")
# Save the plot with the name based on row numbers
plt.savefig(f"Row_{start_row+1}_{end_row}.png")
plt.close()
for i, correlation in enumerate(correlations): start_row = i 750 end_row = (i + 1) 750 print(f"Correlation between YATAS and BANVT for Rows {start_row+1}-{end_row}: {correlation}")
In this modified code, the figures are named with the corresponding row numbers for each 750-row period. The list correlations contains correlation values, and the name of each figure is based on the row numbers.
i want to plot correlation matrix between these two column for each 750 rows ChatGPT To plot a correlation matrix between the "YATAS" and "BANVT" columns for each 750-row period, you can use the following code: python Copy code import matplotlib.pyplot as plt import pandas as pd
selected_columns = ["YATAS", "BANVT"]
total_rows = len(allticks_data) num_sets = total_rows // 750
correlation_matrices = []
for i in range(num_sets): start_row = i 750 end_row = (i + 1) 750 period_data = allticks_data[selected_columns][start_row:end_row]
# Calculate the correlation matrix between the two columns
correlation_matrix = period_data.corr()
correlation_matrices.append(correlation_matrix)
# Create a heatmap of the correlation matrix
plt.figure(figsize=(8, 6))
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='none', aspect='auto')
plt.colorbar()
plt.title(f"Correlation Matrix for Rows {start_row+1}-{end_row}")
# Set x and y-axis labels
plt.xticks(range(len(selected_columns), selected_columns))
plt.yticks(range(len(selected_columns), selected_columns))
# Show the plot
plt.show()
for i, matrix in enumerate(correlation_matrices): start_row = i 750 end_row = (i + 1) 750 print(f"Correlation Matrix for Rows {start_row+1}-{end_row}:\n{matrix}")
In this code, we calculate the correlation matrix for the "YATAS" and "BANVT" columns for each 750-row period and create a heatmap plot of the correlation matrix. The list correlation_matrices stores the correlation matrices for each period, and they are displayed using plt.show(). You can adjust the figure size, color map, labels, and other visual settings as needed.
i want to have a scatter plot of these two columns and use poly1d () function to fit a line between these data and to see if the correlation is positive or negative. do this for the first 750 columns ChatGPT To create a scatter plot of the "YATAS" and "BANVT" columns for the first 750 rows and fit a line to the data using poly1d() to determine if the correlation is positive or negative, you can use the following code: python Copy code import matplotlib.pyplot as plt import numpy as np from numpy.polynomial.polynomial import polyfit
selected_columns = ["YATAS", "BANVT"] period_data = allticks_data[selected_columns].iloc[:750]
x = period_data["YATAS"] y = period_data["BANVT"]
plt.figure(figsize=(8, 6)) plt.scatter(x, y, label="Data Points", color="blue")
b, m = polyfit(x, y, 1) fit_line = np.poly1d([b, m])
plt.plot(x, fit_line(x), color="red", label=f"Fitted Line (y = {m:.2f}x + {b:.2f})")
plt.xlabel("YATAS") plt.ylabel("BANVT") plt.title("Scatter Plot and Fitted Line for the First 750 Rows")
correlation_sign = "positive" if m > 0 else "negative"
plt.legend(loc="upper right") plt.show()
print(f"Correlation is {correlation_sign}")
This code selects the data for the first 750 rows, creates a scatter plot of "YATAS" and "BANVT," fits a line to the data using polyfit(), and determines whether the correlation is positive or negative based on the slope of the fitted line.
it gave the error "SVD did not converge in Linear Least Squares". how else i can visualize the correlation ChatGPT If you encountered the "SVD did not converge in Linear Least Squares" error while trying to fit a line using polyfit, it indicates that the linear least squares fitting method had difficulties converging, possibly due to the nature of the data. In such cases, you can still visualize the correlation between the two variables using alternative methods. One common approach is to use a scatter plot and calculate the correlation coefficient. Here's how you can do that: python Copy code import matplotlib.pyplot as plt import numpy as np
selected_columns = ["YATAS", "BANVT"] period_data = allticks_data[selected_columns].iloc[:750]
x = period_data["YATAS"] y = period_data["BANVT"]
plt.figure(figsize=(8, 6)) plt.scatter(x, y, label="Data Points", color="blue")
correlation_coefficient = np.corrcoef(x, y)[0, 1]
plt.xlabel("YATAS") plt.ylabel("BANVT") plt.title("Scatter Plot for the First 750 Rows")
plt.text(0.2, 0.9, f"Correlation: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)
plt.legend(loc="upper right") plt.show()
correlation_sign = "positive" if correlation_coefficient > 0 else "negative"
print(f"Correlation is {correlation_sign}")
This code creates a scatter plot and calculates the correlation coefficient using np.corrcoef(). It also displays the correlation coefficient value on the plot. The sign of the correlation can be determined based on the coefficient's sign, just as before. This approach provides a visual representation of the correlation without the need for fitting a line.
can you also print the correlation ChatGPT Certainly! I've updated the code to print the correlation coefficient along with the visualization: python Copy code import matplotlib.pyplot as plt import numpy as np
selected_columns = ["YATAS", "BANVT"] period_data = allticks_data[selected_columns].iloc[:750]
x = period_data["YATAS"] y = period_data["BANVT"]
correlation_coefficient = np.corrcoef(x, y)[0, 1]
plt.figure(figsize=(8, 6)) plt.scatter(x, y, label="Data Points", color="blue")
plt.xlabel("YATAS") plt.ylabel("BANVT") plt.title("Scatter Plot for the First 750 Rows")
plt.text(0.2, 0.9, f"Correlation: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)
plt.legend(loc="upper right") plt.show()
correlation_sign = "positive" if correlation_coefficient > 0 else "negative"
print(f"Correlation is {correlation_coefficient:.2f}, which is {correlation_sign}")
This code calculates the correlation coefficient and prints it along with the sign of the correlation, and the correlation coefficient is also displayed on the scatter plot.
i want to see the correlation, not the correlation coefficient ChatGPT I see your request. If you want to see the correlation as the visual representation, you can use a scatter plot matrix to visualize the correlation between variables. You can use Seaborn to create a scatter plot matrix. Here's how you can do that: python Copy code import seaborn as sns import pandas as pd
selected_columns = ["YATAS", "BANVT"] period_data = allticks_data[selected_columns].iloc[:750]
df = pd.DataFrame(data=period_data, columns=selected_columns)
sns.pairplot(df, kind="scatter")
plt.show()
This code will create a scatter plot matrix for the selected columns, and you can observe the correlations between the variables in the visual representation. Seaborn will also provide you with scatterplots for each combination of variables in the selected columns, allowing you to see the relationships and correlations between them.
in the code you wrote for correlation coefficient, i also want to calculate the correlation itself and print it too ChatGPT Certainly! If you want to calculate and print the correlation itself, along with the correlation coefficient, you can do the following: python Copy code import matplotlib.pyplot as plt import numpy as np
selected_columns = ["YATAS", "BANVT"] period_data = allticks_data[selected_columns].iloc[:750]
x = period_data["YATAS"] y = period_data["BANVT"]
correlation_coefficient = np.corrcoef(x, y)[0, 1] correlation = np.corrcoef(x, y)[0, 1] # Calculate the correlation itself
plt.figure(figsize=(8, 6)) plt.scatter(x, y, label="Data Points", color="blue")
plt.xlabel("YATAS") plt.ylabel("BANVT") plt.title("Scatter Plot for the First 750 Rows")
plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12) plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)
plt.legend(loc="upper right") plt.show()
correlation_sign = "positive" if correlation_coefficient > 0 else "negative"
print(f"Correlation is {correlation:.2f}, which is {correlation_sign}")
This code calculates both the correlation coefficient and the correlation itself, and prints them along with the sign of the correlation. It also displays the correlation coefficient and correlation on the scatter plot.
when i run my code it says that correlation: nan. why is that the case? ChatGPT If you're encountering "correlation: nan" in your output, it suggests that the calculated correlation coefficient is resulting in "NaN" (Not-a-Number) value. This typically occurs when one or both of the variables being used for correlation contain missing or undefined values (i.e., NaN) in the dataset. To address this issue, you can check and handle missing or undefined values in your data before calculating the correlation. Here are a few steps you can take: Check for missing values: Use pd.isna() or pd.isnull() to check for missing values in your dataset, specifically in the "YATAS" and "BANVT" columns. python Copy code missing_values = period_data.isna().any() print("Missing Values:", missing_values)
Handle missing values: If missing values are present, you can choose to either remove the rows with missing values or impute the missing values with an appropriate strategy, such as mean, median, or interpolation. For example, you can remove rows with missing values as follows: python Copy code period_data = period_data.dropna()
Or you can impute missing values with the mean: python Copy code period_data.fillna(period_data.mean(), inplace=True)
After handling missing values, try calculating the correlation again. This should resolve the issue of getting "NaN" as the correlation result. If you're still encountering the issue after checking and handling missing values, there might be other factors at play, such as unusual data characteristics or a mistake in data preprocessing. In such cases, a more detailed examination of the data and the code may be necessary to identify the specific cause of the issue.
can i write this code for it to calculate without missing values? ChatGPT Certainly! You can modify the code to calculate the correlation without missing values by removing any rows that contain missing values in the dataset. Here's how you can do that: python Copy code import matplotlib.pyplot as plt import numpy as np
selected_columns = ["YATAS", "BANVT"] period_data = allticks_data[selected_columns].iloc[:750]
period_data.dropna(inplace=True)
x = period_data["YATAS"] y = period_data["BANVT"]
correlation_coefficient = np.corrcoef(x, y)[0, 1] correlation = np.corrcoef(x, y)[0, 1] # Calculate the correlation itself
plt.figure(figsize=(8, 6)) plt.scatter(x, y, label="Data Points", color="blue")
plt.xlabel("YATAS") plt.ylabel("BANVT") plt.title("Scatter Plot for the First 750 Rows")
plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12) plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)
plt.legend(loc="upper right") plt.show()
correlation_sign = "positive" if correlation_coefficient > 0 else "negative"
print(f"Correlation is {correlation:.2f}, which is {correlation_sign}")
In this code, the dropna() method is used to remove rows with missing values, and then the correlation calculation is performed on the cleaned dataset. This ensures that the calculation is done without missing values, and you won't encounter "NaN" as the correlation result.
can you repeat this code for each 750 rows. i want to have separeter plots for each. and i want to have a list of all correlation values at the end. in the list, i also want to see the corresponding timestamp to those rows, i.e. the values in the "timestamp" column that are at the start and end rows ChatGPT To calculate and visualize correlations for each 750-row period, create separate plots for each period, and generate a list of correlation values along with the corresponding timestamps, you can use the following code: python Copy code import matplotlib.pyplot as plt import numpy as np
selected_columns = ["YATAS", "BANVT"]
correlations_with_timestamps = []
num_sets = len(allticks_data) // 750
for i in range(num_sets): start_row = i 750 end_row = (i + 1) 750 period_data = allticks_data[selected_columns][start_row:end_row]
# Drop rows with missing values
period_data.dropna(inplace=True)
# Extract the data for each column
x = period_data["YATAS"]
y = period_data["BANVT"]
# Calculate the correlation coefficient and correlation
correlation_coefficient = np.corrcoef(x, y)[0, 1]
correlation = np.corrcoef(x, y)[0, 1] # Calculate the correlation itself
# Get the start and end timestamps
start_timestamp = allticks_data["timestamps"].iloc[start_row]
end_timestamp = allticks_data["timestamps"].iloc[end_row - 1]
# Create a scatter plot for the period
plt.figure(figsize=(8, 6))
plt.scatter(x, y, label="Data Points", color="blue")
# Set labels and title
plt.xlabel("YATAS")
plt.ylabel("BANVT")
plt.title(f"Scatter Plot for Rows {start_row+1}-{end_row}")
# Add text to display the correlation coefficient and correlation
plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)
plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)
# Display the plot
plt.legend(loc="upper right")
plt.show()
# Determine the correlation sign based on the correlation coefficient
correlation_sign = "positive" if correlation_coefficient > 0 else "negative"
# Add the correlation and timestamps to the list
correlations_with_timestamps.append({
"Correlation": correlation,
"Correlation Sign": correlation_sign,
"Start Timestamp": start_timestamp,
"End Timestamp": end_timestamp
})
for item in correlations_with_timestamps: print(f"Correlation: {item['Correlation']:.2f}, Sign: {item['Correlation Sign']}, Start Timestamp: {item['Start Timestamp']}, End Timestamp: {item['End Timestamp']}")
This code calculates correlations, creates scatter plots, and stores the correlation values along with the corresponding timestamps in the list correlations_with_timestamps. Each element in the list contains the correlation value, sign, and the start and end timestamps for each 750-row period.
can i add up all these correlation values ChatGPT If you want to calculate the sum of all the correlation values from the list correlations_with_timestamps, you can do so by iterating through the list and summing the correlation values. Here's how you can calculate the sum of all correlation values: python Copy code
total_correlation_sum = sum(item["Correlation"] for item in correlations_with_timestamps)
print(f"Sum of all correlation values: {total_correlation_sum:.2f}")
You can add this code at the end of the previous code I provided. It calculates the sum of all correlation values and prints the result. This will give you the cumulative correlation value for all the 750-row periods.
i want to plot these two columns between these row periods seperately " 5251-6000 10501-11250 12751-13500 17251-18000 23251-24000 24000-24750 28750-29250 31501-32250 36001-36750 38251-39000 39001-39750 42751-43500 44251-45000 47251-48000 48001-48750" ChatGPT To create separate scatter plots for the "YATAS" and "BANVT" columns for the specified row periods and calculate correlations for each of these periods, you can use the following code: python Copy code import matplotlib.pyplot as plt import numpy as np
row_periods = [ (5251, 6000), (10501, 11250), (12751, 13500), (17251, 18000), (23251, 24000), (24001, 24750), (28751, 29250), (31501, 32250), (36001, 36750), (38251, 39000), (39001, 39750), (42751, 43500), (44251, 45000), (47251, 48000), (48001, 48750) ]
correlation_values = []
for start_row, end_row in row_periods: period_data = allticks_data[["YATAS", "BANVT"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing period_data.dropna(inplace=True)
# Extract the data for each column
x = period_data["YATAS"]
y = period_data["BANVT"]
# Calculate the correlation coefficient and correlation
correlation_coefficient = np.corrcoef(x, y)[0, 1]
correlation = np.corrcoef(x, y)[0, 1]
# Create a scatter plot for the period
plt.figure(figsize=(8, 6))
plt.scatter(x, y, label="Data Points", color="blue")
# Set labels and title
plt.xlabel("YATAS")
plt.ylabel("BANVT")
plt.title(f"Scatter Plot for Rows {start_row}-{end_row}")
# Add text to display the correlation coefficient and correlation
plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)
plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)
# Display the plot
plt.legend(loc="upper right")
plt.show()
# Determine the correlation sign based on the correlation coefficient
correlation_sign = "positive" if correlation_coefficient > 0 else "negative"
# Append the correlation value to the list
correlation_values.append({
"Start Row": start_row,
"End Row": end_row,
"Correlation": correlation,
"Correlation Sign": correlation_sign
})
for item in correlation_values: print(f"Rows {item['Start Row']}-{item['End Row']}: Correlation {item['Correlation']:.2f}, Sign: {item['Correlation Sign']}")
This code creates scatter plots and calculates correlations for each of the specified row periods, and it also prints the list of correlation values with the corresponding row period details.
i only want to plot the values in those rows ChatGPT If you want to create scatter plots for only the specified row periods and not include rows outside those periods, you can modify the code as follows: python Copy code import matplotlib.pyplot as plt import numpy as np
row_periods = [ (5251, 6000), (10501, 11250), (12751, 13500), (17251, 18000), (23251, 24000), (24001, 24750), (28751, 29250), (31501, 32250), (36001, 36750), (38251, 39000), (39001, 39750), (42751, 43500), (44251, 45000), (47251, 48000), (48001, 48750) ]
correlation_values = []
for start_row, end_row in row_periods: period_data = allticks_data[["YATAS", "BANVT"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing period_data.dropna(inplace=True)
# Extract the data for each column
x = period_data["YATAS"]
y = period_data["BANVT"]
# Calculate the correlation coefficient and correlation
correlation_coefficient = np.corrcoef(x, y)[0, 1]
correlation = np.corrcoef(x, y)[0, 1]
# Create a scatter plot for the period
plt.figure(figsize=(8, 6))
plt.scatter(x, y, label="Data Points", color="blue")
# Set labels and title
plt.xlabel("YATAS")
plt.ylabel("BANVT")
plt.title(f"Scatter Plot for Rows {start_row}-{end_row}")
# Add text to display the correlation
i only want to see the values of those columns at those rows. i dont want to see the correlations ChatGPT If you want to create scatter plots for only the specified row periods without displaying correlation values, you can modify the code to focus on plotting the data only, without calculating and displaying the correlation. Here's how to do that: python Copy code import matplotlib.pyplot as plt
row_periods = [ (5251, 6000), (10501, 11250), (12751, 13500), (17251, 18000), (23251, 24000), (24001, 24750), (28751, 29250), (31501, 32250), (36001, 36750), (38251, 39000), (39001, 39750), (42751, 43500), (44251, 45000), (47251, 48000), (48001, 48750) ]
for start_row, end_row in row_periods: period_data = allticks_data[["YATAS", "BANVT"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing
# Extract the data for each column
x = period_data["YATAS"]
y = period_data["BANVT"]
# Create a scatter plot for the period
plt.figure(figsize=(8, 6))
plt.scatter(x, y, label="Data Points", color="blue")
# Set labels and title
plt.xlabel("YATAS")
plt.ylabel("BANVT")
plt.title(f"Scatter Plot for Rows {start_row}-{end_row}")
# Display the plot
plt.legend(loc="upper right")
plt.show()
This code will create separate scatter plots for the specified row periods, focusing on displaying the data without calculating or displaying correlation values.
i want to see these rows of "YATAS" and "BANVT" in that list plotted in a graph. ChatGPT To display the rows of "YATAS" and "BANVT" in the specified row periods as separate graphs, you can create subplots for each row period and plot the data. Here's how you can do it: python Copy code import matplotlib.pyplot as plt
row_periods = [ (5251, 6000), (10501, 11250), (12751, 13500), (17251, 18000), (23251, 24000), (24001, 24750), (28751, 29250), (31501, 32250), (36001, 36750), (38251, 39000), (39001, 39750), (42751, 43500), (44251, 45000), (47251, 48000), (48001, 48750) ]
for i, (start_row, end_row) in enumerate(row_periods, start=1): period_data = allticks_data[["YATAS", "BANVT"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing
# Extract the data for each column
x = period_data["YATAS"]
y = period_data["BANVT"]
# Create a new subplot for each row period
plt.subplot(5, 3, i) # 5 rows, 3 columns of subplots
# Plot the data
plt.plot(x, label="YATAS", color="blue")
plt.plot(y, label="BANVT", color="red")
# Set labels and title
plt.xlabel("Row")
plt.ylabel("Value")
plt.title(f"Rows {start_row}-{end_row}")
# Add a legend
plt.legend()
plt.tight_layout()
plt.show()
This code creates separate graphs (subplots) for each row period, displaying the "YATAS" and "BANVT" values in those rows. You can adjust the number of rows and columns of subplots to fit your preferred layout. In this example, there are 5 rows and 3 columns of subplots.
i dont want to see them as subplots. i want to see them as seperate plots ChatGPT If you want to see each row period as a separate individual plot, not as subplots, you can modify the code to create separate plots for each row period. Here's how to do that: python Copy code import matplotlib.pyplot as plt
row_periods = [ (5251, 6000), (10501, 11250), (12751, 13500), (17251, 18000), (23251, 24000), (24001, 24750), (28751, 29250), (31501, 32250), (36001, 36750), (38251, 39000), (39001, 39750), (42751, 43500), (44251, 45000), (47251, 48000), (48001, 48750) ]
for start_row, end_row in row_periods: period_data = allticks_data[["YATAS", "BANVT"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing
# Extract the data for each column
x = period_data["YATAS"]
y = period_data["BANVT"]
# Create a new figure for each row period
plt.figure(figsize=(8, 6))
# Plot the data
plt.plot(x, label="YATAS", color="blue")
plt.plot(y, label="BANVT", color="red")
# Set labels and title
plt.xlabel("Row")
plt.ylabel("Value")
plt.title(f"Rows {start_row}-{end_row}")
# Add a legend
plt.legend()
# Show the individual plot
plt.show()
This code creates separate individual plots for each row period, displaying the "YATAS" and "BANVT" values in those rows without using subplots.
as i said, in this data set; there are 50011 rows and 61 columns. The first column consist of timestamps, other 60 columns each belongs to a different company's stock prices at that time stamp. How can i apply Principal Component Analysis to this data set ChatGPT Principal Component Analysis (PCA) is a dimensionality reduction technique commonly used for data with multiple features or dimensions. In your case, you have a dataset with 50011 rows and 61 columns, where the first column contains timestamps and the remaining 60 columns represent different company's stock prices at those timestamps. You can apply PCA to this dataset as follows: Data Preprocessing: Remove the timestamp column, as PCA operates on numerical data, not timestamps. Handle any missing data by imputing missing values or removing rows with missing data, as PCA doesn't work well with missing values. Standardization (Optional but Recommended): Standardize the data by subtracting the mean and dividing by the standard deviation for each of the 60 columns containing stock prices. Standardization is recommended for PCA to give equal weight to all features. Apply PCA: Use a library like scikit-learn in Python to apply PCA. Here's an example of how to do it: python Copy code from sklearn.decomposition import PCA
stock_data = allticks_data.drop(columns=['timestamp'])
from sklearn.preprocessing import StandardScaler scaler = StandardScaler() stock_data_standardized = scaler.fit_transform(stock_data)
n_components = 60 # Number of principal components to retain pca = PCA(n_components=n_components) pca.fit(stock_data_standardized) # Use the standardized data if you performed standardization
principalcomponents = pca.components
explained_variance_ratio = pca.explained_varianceratio
The n_components parameter in PCA allows you to specify how many principal components you want to retain. You can choose the number based on the variance you want to explain. explained_variance_ratio will tell you the proportion of variance explained by each principal component. Visualization (Optional): You can visualize the explained variance to decide how many principal components to retain. A scree plot or a cumulative explained variance plot can be helpful for this purpose. Transform Data: You can transform your data using the selected number of principal components if you want to use the reduced-dimensional representation for further analysis. Interpret Results: You can analyze the principal components to understand which features (stocks) contribute most to each component. Remember that the number of principal components to retain is a crucial decision and should be based on the amount of variance you want to capture while reducing the dimensionality of your data. You might want to experiment with different values of n_components and see how much variance is explained to make an informed decision.
can i fill in the missing rows with the average of the existing nearest upper and below values? can you add this to the code before applying the pca ChatGPT Certainly, you can fill in missing values in a DataFrame by taking the average of the nearest upper and lower values. You can use the fillna() method in pandas to accomplish this. Here's how you can update your code to fill in missing values with the average of the nearest neighbors before applying PCA: python Copy code from sklearn.decomposition import PCA import pandas as pd
stock_data = allticks_data.drop(columns=['timestamp'])
stock_data_filled = stock_data.interpolate(method='linear', axis=0)
from sklearn.preprocessing import StandardScaler scaler = StandardScaler() stock_data_standardized = scaler.fit_transform(stock_data_filled)
n_components = 60 # Number of principal components to retain pca = PCA(n_components=n_components) pca.fit(stock_data_standardized) # Use the standardized data if you performed standardization
principalcomponents = pca.components
explained_variance_ratio = pca.explained_varianceratio
In this code, we use the interpolate method with the linear interpolation method to fill in missing values in each column. This method linearly interpolates the missing values based on the values of their nearest upper and lower neighbors in the column. After filling in the missing values, you can proceed with PCA as before.
it still gives the error "ValueError: Input X contains NaN. PCA does not accept missing values encoded as NaN natively." ChatGPT I apologize for the confusion. You're right; PCA in scikit-learn does not handle missing values natively, and interpolation using the interpolate method may not be sufficient. In this case, you need to handle the missing values more explicitly. One common approach is to impute missing values with the mean of the respective column. Here's how you can do it: python Copy code from sklearn.decomposition import PCA import pandas as pd from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler
stock_data = allticks_data.drop(columns=['timestamp'])
imputer = SimpleImputer(strategy='mean') stock_data_imputed = imputer.fit_transform(stock_data)
scaler = StandardScaler() stock_data_standardized = scaler.fit_transform(stock_data_imputed)
n_components = 60 # Number of principal components to retain pca = PCA(n_components=n_components) pca.fit(stock_data_standardized)
principalcomponents = pca.components
explained_variance_ratio = pca.explained_varianceratio
In this code: We use the SimpleImputer from scikit-learn to impute missing values with the column mean. This is a common method to handle missing data before applying PCA. After imputing missing values, we standardize the data with StandardScaler. Then, we proceed to perform PCA as previously described. This approach ensures that missing values are handled before PCA, and it should resolve the "ValueError: Input X contains NaN" issue.
how can i interpret the results of explained_variance_ratio array([4.88093861e-01, 1.70283496e-01, 1.07268396e-01, 4.98911574e-02, 3.80809652e-02, 2.14055156e-02, 1.53236261e-02, 1.32797651e-02, 1.02353851e-02, 8.87784475e-03, 8.34348500e-03, 7.10699121e-03, 6.22456829e-03, 5.94513977e-03, 4.79582514e-03, 4.54884774e-03, 3.47738719e-03, 3.08820485e-03, 2.63245555e-03, 2.61131197e-03, 2.22285259e-03, 2.06089345e-03, 1.81811064e-03, 1.71262120e-03, 1.62892002e-03, 1.36144582e-03, 1.28810777e-03, 1.18659269e-03, 1.12189254e-03, 1.06948368e-03, 9.66592430e-04, 9.34899694e-04, 8.18675874e-04, 7.97092634e-04, 7.39834592e-04, 6.91087590e-04, 6.55839655e-04, 6.18756397e-04, 6.07880344e-04, 5.53020787e-04, 4.75100471e-04, 4.63767517e-04, 4.27597596e-04, 4.22311765e-04, 3.99869410e-04, 3.67213385e-04, 3.53299276e-04, 3.32230281e-04, 3.12833764e-04, 2.81917451e-04, 2.53341253e-04, 2.24278065e-04, 2.18195177e-04, 2.09072241e-04, 1.82989643e-04, 1.64178884e-04, 1.52198743e-04, 1.45553903e-04, 1.42375238e-04, 1.02846619e-04]) ChatGPT The explained_variance_ratio array returned by PCA represents the proportion of the total variance in the original dataset that is explained by each principal component. Each value in this array indicates the ratio of variance explained by a particular principal component. In your specific example, you have an array with 60 values, and it represents the variance explained by each of the 60 principal components. To interpret these values, you can consider the following: Variance Explained by Each Component: The first value in the array (0.488) indicates that the first principal component explains approximately 48.8% of the total variance in your data. The second value (0.170) indicates that the second principal component explains approximately 17.0% of the variance, and so on. Cumulative Variance Explained: You can also calculate the cumulative explained variance by summing the values in the array. For example, if you sum the first two values, you get 0.488 + 0.170 = 0.658, which means that the first two principal components explain approximately 65.8% of the total variance in your data. Dimensionality Reduction: These values help you decide how many principal components to retain for dimensionality reduction. If you want to retain a certain percentage of the total variance (e.g., 95%), you can sum the values in the explained_variance_ratio array until you reach or exceed that threshold. The corresponding number of principal components will give you the reduced dimensionality. Interpretation of Principal Components: The values in the array also give you an idea of which principal components capture the most variance. Components with higher values explain more variance and are more important in representing the data. You can also examine the loadings of the original features on these principal components to interpret the relationship between the original features and the principal components. In summary, the explained_variance_ratio array provides valuable information about how well each principal component represents the variance in your data and helps you make decisions about dimensionality reduction and feature importance.
does n_components have to be equal to the column numbers ChatGPT No, the n_components parameter in PCA does not have to be equal to the number of columns (features) in your dataset. In fact, one of the main purposes of PCA is dimensionality reduction, which often involves reducing the number of components (dimensions) to a smaller value than the original number of features. You can choose the value of n_components based on your specific goals, such as: Preserving Variance: You can select n_components to preserve a certain percentage of the total variance in your data. For example, you might choose to retain enough components to capture 95% of the variance. Dimensionality Reduction: If your dataset has a large number of features, you can reduce dimensionality by selecting a smaller value for n_components. This can help mitigate the curse of dimensionality and improve the efficiency of subsequent analysis. Feature Selection: PCA can also be used for feature selection. You might choose a value of n_components that corresponds to the most important components and, therefore, the most important original features. Visualization: In some cases, you might select n_components to facilitate visualization. For example, you might choose two or three components to create 2D or 3D plots. The choice of n_components should be based on the specific objectives of your analysis. It's not required to be equal to the number of columns, and in many cases, it's desirable to select a smaller value to achieve one of the objectives mentioned above.
what does pca do with this data set? i dont understand the benefit of doing this? can you explain ChatGPT Principal Component Analysis (PCA) is a dimensionality reduction technique that can be applied to a dataset like the one you described, which has multiple features (in your case, 60 columns representing stock prices). PCA serves several purposes and provides several benefits: Dimensionality Reduction: One of the main goals of PCA is to reduce the dimensionality of the data. In your dataset, you have 60 columns (features), and it might be challenging to work with such high-dimensional data. PCA allows you to transform the data into a new set of variables (principal components) that capture the most important information while reducing the number of dimensions. This can make subsequent analysis more manageable and computationally efficient. Decorrelation: PCA creates new variables (principal components) that are linear combinations of the original features. These new variables are designed to be uncorrelated with each other. This can help simplify the relationships between variables and reduce multicollinearity, which can be beneficial in various statistical and machine learning tasks. Variance Capture: PCA arranges the principal components in descending order of the variance they explain. The first principal component explains the most variance, and subsequent components explain less. By selecting a subset of these components, you can retain the most important information in your data while reducing noise. Data Compression: PCA can be used for data compression. The reduced-dimensional representation of your data (using fewer principal components) can save storage space and speed up data processing. Visualization: PCA can help with data visualization. By projecting data points onto a lower-dimensional space (e.g., 2D or 3D), you can create scatter plots or other visualizations that allow you to explore and understand your data more easily. Feature Selection: PCA can assist in feature selection. You can identify which original features contribute most to the principal components with the highest variance. This information can guide feature selection in machine learning or statistical modeling. Noise Reduction: By focusing on the principal components with the highest variance and ignoring those with low variance, you effectively filter out noise and outliers in your data. Improved Model Performance: In some cases, reducing the dimensionality of your data through PCA can lead to improved model performance in machine learning tasks, as models may become less prone to overfitting. Overall, PCA is a valuable technique when you have high-dimensional data, and it can serve various purposes, depending on your specific goals. It can help you gain insights into your data, simplify complex datasets, and prepare the data for downstream analysis or modeling.
can i see the components in this pca ChatGPT Yes, you can certainly examine the principal components obtained from PCA. The principal components are linear combinations of the original features that capture the most significant variance in the data. You can view these components to gain insights into the relationships between the original features and how they contribute to each component. Here's how you can do it: python Copy code
principalcomponents = pca.components
components_df = pd.DataFrame(principal_components, columns=stock_data.columns)
print(components_df)
This code will create a DataFrame called components_df where each row corresponds to a principal component, and each column corresponds to an original feature. Each cell in the DataFrame represents the weight of the corresponding feature in the respective principal component. These weights indicate how strongly each feature contributes to the component. By examining the components, you can gain insights into which original features are most influential in each principal component and how the components relate to the underlying structure of your data. The first few principal components often capture the most significant patterns, so you may focus on them for interpretation.
can i have a table related to importance of components (standard deviation, proportion of variance, cumulative proportion) and another table for loadings (columns and components against them) ChatGPT Certainly, you can create tables to display the importance of components, including standard deviation, proportion of variance, and cumulative proportion. Additionally, you can create a table to show the loadings, which represent the relationship between the original columns and the principal components. Here's how you can do it: python Copy code
component_stats = pd.DataFrame({ 'Component': range(1, len(explained_variance_ratio) + 1), 'Standard Deviation': np.sqrt(pca.explainedvariance), 'Proportion of Variance': pca.explained_varianceratio, 'Cumulative Proportion': np.cumsum(pca.explained_varianceratio) })
print("Component Statistics:") print(component_stats)
loadings = pd.DataFrame(pca.components_, columns=stock_data.columns)
print("\nLoadings Table:") print(loadings)
In this code: component_stats is a DataFrame that provides information about each component, including its standard deviation, proportion of variance explained, and cumulative proportion of variance explained. loadings is a DataFrame that shows the loadings, which represent the relationship between the original columns and the principal components. Each row in the loadings table corresponds to a principal component, and each column represents an original feature. The values in the table indicate how strongly each feature contributes to each component. By examining these tables, you can better understand the importance of each component, the proportion of variance explained by each component, and the relationships between the original features and the principal components. This information can be valuable for interpretation and decision-making in your analysis.
can you modify the code you wrote for PCA and add this part ChatGPT Certainly! Here's the modified code that includes the creation of tables for component statistics and loadings: python Copy code from sklearn.decomposition import PCA import pandas as pd import numpy as np
stock_data = allticks_data.drop(columns=['timestamp'])
imputer = SimpleImputer(strategy='mean') stock_data_imputed = imputer.fit_transform(stock_data)
scaler = StandardScaler() stock_data_standardized = scaler.fit_transform(stock_data_imputed)
n_components = 60 # Number of principal components to retain pca = PCA(n_components=n_components) pca.fit(stock_data_standardized)
explained_variance_ratio = pca.explained_varianceratio
component_stats = pd.DataFrame({ 'Component': range(1, len(explained_variance_ratio) + 1), 'Standard Deviation': np.sqrt(pca.explainedvariance), 'Proportion of Variance': pca.explained_varianceratio, 'Cumulative Proportion': np.cumsum(pca.explained_varianceratio) })
print("Component Statistics:") print(component_stats)
loadings = pd.DataFrame(pca.components_, columns=stock_data.columns)
print("\nLoadings Table:") print(loadings)
With this code, you'll perform PCA, calculate the explained variance ratio, and then create tables for component statistics and loadings, as requested. The component statistics table provides insights into the importance of each component, while the loadings table shows the relationship between the original features and the principal components.
how can i interpret the component statistics table? ChatGPT Interpreting the component statistics table generated from PCA is important for understanding the significance of each principal component and deciding how many components to retain. Here's a breakdown of the key statistics in the table: Component Number: This is a simple index that identifies each principal component. The first component is the most important, the second is the second most important, and so on. Standard Deviation: The standard deviation of each principal component measures the spread or variability of data points along that component's direction. Larger standard deviations indicate more spread, suggesting that the component captures more variance in the data. You can use the standard deviation to gauge the scale of each component's importance. Proportion of Variance: This value represents the proportion of the total variance in the original dataset explained by each principal component. It is a critical statistic in PCA. Larger values indicate that the component captures a larger portion of the dataset's variance. For instance, if the first component's proportion of variance is 0.488, it means that the first component explains approximately 48.8% of the total variance in your data. Cumulative Proportion: This is the cumulative sum of the proportions of variance. It shows the cumulative explained variance as you move through the components. You can use this to determine how many components are needed to capture a certain percentage of the total variance. For example, if the cumulative proportion exceeds 0.95, it means that the selected components together explain over 95% of the total variance. Interpretation of the component statistics table often involves looking at the proportion of variance and cumulative proportion. Here's how to interpret these statistics: Proportion of Variance: Larger values indicate more important components. If you are interested in dimensionality reduction or capturing the most significant patterns in your data, you should focus on components with higher proportions of variance. Cumulative Proportion: This column helps you decide how many components to retain. It indicates the fraction of the total variance you're capturing as you add components in order. If you want to retain a certain percentage of the total variance (e.g., 95% or 99%), look for the point in the cumulative proportion column where it exceeds that threshold. The number of components at that point is how many you might consider retaining. In summary, the component statistics table helps you identify the most important components and decide how many components you need to retain to capture a desired percentage of the total variance in your data. It guides the dimensionality reduction process and provides insights into the underlying structure of your dataset.
how can i interpret the loadings table? ChatGPT Interpreting the loadings table generated from PCA is essential for understanding how the original features (columns) contribute to each principal component. The loadings table represents the relationships between the original features and the principal components. Here's how to interpret the loadings: Principal Components (Rows): Each row in the loadings table corresponds to a principal component. The first row represents the loadings for the first principal component, the second row for the second component, and so on. The values in each row indicate how strongly each original feature contributes to that specific component. Original Features (Columns): Each column in the loadings table represents an original feature from your dataset. The values in each column indicate the relationship between that feature and the principal components. The larger the absolute value of a loading in a column, the more that feature contributes to the principal component. Positive and Negative Loadings: The signs (positive or negative) of the loadings are important. A positive loading indicates a positive correlation between the original feature and the principal component. In contrast, a negative loading indicates a negative correlation. Large positive or negative loadings imply that a feature strongly influences the corresponding component. Strength of Loadings: The magnitude (absolute value) of the loading indicates the strength of the relationship. Larger absolute values signify stronger associations between a feature and a component. If the loading is close to 1 (positive or negative), it suggests that the feature has a strong influence on the component. Zero Loadings: Loadings close to zero suggest that the original feature has minimal impact on the corresponding principal component. Features with near-zero loadings are not contributing significantly to that component. To interpret the loadings table: Pattern Recognition: Examine the loadings in each component to identify which original features are strongly associated with that component. High positive or negative loadings indicate that the corresponding feature plays a significant role in defining that component. Feature Importance: Features with the largest absolute loadings in a component are the most important for that component. You can identify which features have the most influence on the overall variation captured by each component. Correlation Direction: Pay attention to the signs of the loadings. Positive loadings indicate a positive correlation between the feature and the component, while negative loadings indicate a negative correlation. Dimension Interpretation: If your principal components correspond to real-world concepts or patterns (e.g., in a stock market dataset, a component might represent a particular industry or sector), you can interpret the loadings to understand which stocks are positively or negatively associated with that component. In summary, the loadings table provides insights into the relationships between original features and principal components, helping you understand how each component captures the variation in your data and which features are most influential in defining those components.
can i see, for each component, which company's have the biggest contribute? ChatGPT Yes, you can identify which original features (in your case, companies) have the largest contributions to each principal component. This information is obtained from the loadings in the loadings table. To determine which companies have the biggest contributions to each component, you can look at the absolute values of the loadings and identify the features with the highest absolute loadings for each component. Here's how you can do it: python Copy code
n_components = 5 # Choose the number of components you want to analyze
top_contributors = {}
for component in range(n_components):
# Get the loadings for the current component
component_loadings = loadings.iloc[component, :]
# Sort the companies by the absolute loadings (contribution)
sorted_contributors = component_loadings.abs().sort_values(ascending=False)
# Store the top contributing companies for the current component
top_contributors[f'Component {component + 1}'] = sorted_contributors.index[0] # The top contributor
for component, top_contributor in top_contributors.items(): print(f"{component}: {top_contributor}")
In this code, we select a certain number of components to analyze (you can change n_components to the desired number). For each component, we extract the loadings, sort them by absolute value in descending order, and store the top contributor (i.e., the company with the largest absolute loading) for that component. The output will show you which company contributes the most to each of the selected components. This information helps you understand which companies are most influential in defining each principal component in your PCA analysis.
"Positive loadings indicate a positive correlation between the feature and the component, while negative loadings indicate a negative correlation." what does positive and negative correlation with the component indicates ChatGPT In the context of Principal Component Analysis (PCA) and its loadings, positive and negative correlations with a component indicate the direction of the relationship between the original features (variables) and that specific component. Here's what positive and negative correlations mean: Positive Correlation: When a feature has a positive loading on a component, it means that an increase in the value of that feature is associated with an increase in the value of the component. In other words, the feature and the component tend to move in the same direction. Features with positive loadings contribute positively to the component, meaning they are positively associated with the pattern represented by that component. Negative Correlation: Conversely, when a feature has a negative loading on a component, it indicates that an increase in the value of that feature is associated with a decrease in the value of the component. The feature and the component tend to move in opposite directions. Features with negative loadings contribute negatively to the component, meaning they are negatively associated with the pattern represented by that component. Understanding the direction of these correlations is crucial for interpreting the principal components. It allows you to grasp the nature of the relationship between the original features and the components. Positive and negative correlations reveal how features contribute to the patterns that the principal components capture. For example, in a stock market dataset, if the first principal component has positive loadings for several technology companies, it indicates that this component represents a common positive trend among those technology companies' stock prices. Conversely, if it has negative loadings for some financial companies, it suggests that these financial companies' stock prices move in the opposite direction of the component. In practical terms, you can use this information to understand which features (companies in your dataset) are positively or negatively associated with the patterns captured by each principal component. This can aid in pattern recognition, feature selection, and data interpretation.
can we say that those who have positive correlations with the componet, have similar stock data trends? ChatGPT Yes, you can infer that features (in your case, companies' stock prices) with positive correlations to the same principal component tend to have similar stock data trends. When multiple features have positive loadings on a particular component, it indicates that these features move in the same direction and share a common pattern or trend in the data. In the context of a stock market dataset, a positive correlation among companies for a specific principal component suggests that these companies' stock prices tend to move together in a similar direction. This might indicate a common underlying factor or trend affecting those companies' stocks. For example, if the first principal component has positive loadings for several technology companies, it implies that these technology companies' stock prices are positively correlated, meaning that when one goes up, the others tend to go up as well. This could indicate a common technology industry trend. Conversely, features (companies) with negative loadings on the same component would move in the opposite direction, suggesting that they are negatively correlated with the companies that have positive loadings. By examining the loadings and correlations between features and principal components, you can identify groups of features that share common trends or patterns in your data, which can be valuable for sector analysis, risk assessment, and investment strategies in the context of a stock market dataset.
is it possible for me to see the companies who have the most similar (i.e. positive or negative in common) correlation signs with a code? ChatGPT Yes, you can write code to identify the companies that have the most similar (positive or negative) correlation signs with each other based on the loadings from PCA. This involves finding pairs of companies that have similar signs in their loadings for a given principal component. Here's how you can approach this: python Copy code import pandas as pd
component_loadings = loadings.iloc[component_number, :]
positive_correlations = component_loadings[component_loadings > 0].index negative_correlations = component_loadings[component_loadings < 0].index
similar_positive = [] similar_negative = []
for company in positive_correlations: for other_company in positive_correlations: if company != other_company: similar_positive.append((company, other_company))
for company in negative_correlations: for other_company in negative_correlations: if company != other_company: similar_negative.append((company, other_company))
print(f"Companies with the most similar positive correlations: {similar_positive}") print(f"Companies with the most similar negative correlations: {similar_negative}")
In this code: component_number represents the specific component you want to analyze. You can change this to the component of interest. We identify companies with positive and negative loadings on the selected component. positive_correlations and negative_correlations are lists of company names for each type of correlation. We iterate through these lists to find pairs of companies that have the same correlation sign (either both positive or both negative). These pairs are stored in similar_positive and similar_negative lists. Finally, we print the companies with the most similar positive and negative correlations on the selected component. This code allows you to identify pairs of companies with the most similar correlation signs with respect to the chosen principal component. It can provide insights into which companies tend to move together or in opposite directions based on that component.
i want to see this for all the components ChatGPT To see the companies that have the most similar (positive or negative) correlation signs for all components, you can modify the code to iterate through each component and collect the pairs of companies for each component. Here's how you can do it: python Copy code import pandas as pd
similar_positive = {} similar_negative = {}
for component_number in range(n_components):
# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]
# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index
# Create lists to store similar companies for the current component
similar_positive[component_number] = []
similar_negative[component_number] = []
# Iterate through the positive correlations
for company in positive_correlations:
for other_company in positive_correlations:
if company != other_company:
similar_positive[component_number].append((company, other_company))
# Iterate through the negative correlations
for company in negative_correlations:
for other_company in negative_correlations:
if company != other_company:
similar_negative[component_number].append((company, other_company))
for component_number in range(n_components): print(f"Component {component_number + 1}:") print(f"Companies with the most similar positive correlations: {similar_positive[component_number]}") print(f"Companies with the most similar negative correlations: {similar_negative[component_number]}\n")
In this code, we iterate through each component and identify pairs of companies with similar correlation signs for positive and negative correlations. The results are stored in dictionaries where the keys represent the component number, and the values are lists of company pairs with similar correlations for that component. The code then prints these results for all components. This allows you to explore which companies tend to move together or in opposite directions for each principal component in your PCA analysis.
i see that this code gives me pairs of companies, but can i see the list of those companies instead ChatGPT If you want to see the lists of companies with similar (positive or negative) correlation signs for each component instead of pairs of companies, you can modify the code to collect the lists of companies and display them. Here's how you can do that: python Copy code import pandas as pd
similar_positive = {} similar_negative = {}
for component_number in range(n_components):
# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]
# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index
# Store the lists of companies for the current component
similar_positive[component_number] = list(positive_correlations)
similar_negative[component_number] = list(negative_correlations)
for component_number in range(n_components): print(f"Component {component_number + 1}:") print(f"Companies with positive correlations: {similar_positive[component_number]}") print(f"Companies with negative correlations: {similar_negative[component_number]}\n")
This modified code collects the lists of companies with similar correlations for positive and negative correlations on each component. It then prints these lists for all components, providing you with a clearer view of which companies share similar correlations for each principal component.
can i see the companies that behave in the most similar way? the companies that have the most number of the same correlation sign for a component in total? ChatGPT If you want to find the companies that behave in the most similar way, meaning they share the most number of the same correlation signs (either positive or negative) for a component across all components, you can modify the code to keep track of such companies. Here's how you can do it: python Copy code import pandas as pd
similar_companies_positive = {} similar_companies_negative = {}
for component_number in range(n_components):
# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]
# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index
# Update the dictionaries to include the companies for the current component
for company in positive_correlations:
if company not in similar_companies_positive:
similar_companies_positive[company] = set()
similar_companies_positive[company].add(component_number)
for company in negative_correlations:
if company not in similar_companies_negative:
similar_companies_negative[company] = set()
similar_companies_negative[company].add(component_number)
def find_most_similar_companies(similar_companies_dict, threshold): return [company for company, components in similar_companies_dict.items() if len(components) >= threshold]
threshold = n_components # All components must have the same correlation sign
most_similar_positive_companies = find_most_similar_companies(similar_companies_positive, threshold) most_similar_negative_companies = find_most_similar_companies(similar_companies_negative, threshold)
print("Companies with the most similar positive correlations:") print(most_similar_positive_companies)
print("\nCompanies with the most similar negative correlations:") print(most_similar_negative_companies)
In this code: We create dictionaries (similar_companies_positive and similar_companies_negative) to track which companies share the same correlation sign for each component. For each component, we update these dictionaries to include the companies that have positive and negative correlations. We define a function, find_most_similar_companies, which allows us to find companies with the most similar correlations based on a threshold. In this case, the threshold is set to the total number of components (n_components), meaning that companies must have the same correlation sign for all components to be considered the most similar. We find companies with the most similar positive and negative correlations based on the threshold and print the results. This code will give you a list of companies that behave in the most similar way by having the same correlation signs across all components. You can adjust the threshold to control the level of similarity required.
please dont seperate between positive and negative in the similarity part. I want to see the list of pairs of companies with the most common same sign numbers, list the number too ChatGPT Certainly, if you want to find pairs of companies that have the most common same-sign numbers (either positive or negative) across all components and list the number of times they share the same sign, you can modify the code as follows: python Copy code import pandas as pd
similar_sign_counts = {}
for component_number in range(n_components):
# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]
# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index
# Update the dictionary with positive correlations
for i, company1 in enumerate(positive_correlations):
for j, company2 in enumerate(positive_correlations):
if i != j:
pair = (company1, company2)
if pair not in similar_sign_counts:
similar_sign_counts[pair] = 0
similar_sign_counts[pair] += 1
# Update the dictionary with negative correlations
for i, company1 in enumerate(negative_correlations):
for j, company2 in enumerate(negative_correlations):
if i != j:
pair = (company1, company2)
if pair not in similar_sign_counts:
similar_sign_counts[pair] = 0
similar_sign_counts[pair] += 1
max_count = max(similar_sign_counts.values()) most_common_pairs = [pair for pair, count in similar_sign_counts.items() if count == max_count]
print("Pairs of companies with the most common same-sign numbers:") for pair in most_common_pairs: count = similar_sign_counts[pair] print(f"Pair: {pair}, Common Same-Sign Count: {count}")
In this code: We create a dictionary similar_sign_counts to track the count of same-sign numbers for pairs of companies. We iterate through each component and, for positive and negative correlations, update the dictionary with pairs of companies that share the same sign. If a pair doesn't exist in the dictionary, we initialize it with a count of 0 and then increment the count. After processing all components, we find the pairs of companies with the most common same-sign numbers by identifying pairs with the maximum count. Finally, we print the pairs of companies and their common same-sign counts. This code provides you with pairs of companies that have the most common same-sign numbers across all components, along with the count of times they share the same sign.
can you list the first 10 pairs ChatGPT Certainly, to list the first 10 pairs of companies with the most common same-sign numbers across all components, you can modify the code to display a limited number of pairs. Here's how you can do it: python Copy code import pandas as pd
similar_sign_counts = {}
for component_number in range(n_components):
# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]
# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index
# Update the dictionary with positive correlations
for i, company1 in enumerate(positive_correlations):
for j, company2 in enumerate(positive_correlations):
if i != j:
pair = (company1, company2)
if pair not in similar_sign_counts:
similar_sign_counts[pair] = 0
similar_sign_counts[pair] += 1
# Update the dictionary with negative correlations
for i, company1 in enumerate(negative_correlations):
for j, company2 in enumerate(negative_correlations):
if i != j:
pair = (company1, company2)
if pair not in similar_sign_counts:
similar_sign_counts[pair] = 0
similar_sign_counts[pair] += 1
sorted_pairs = sorted(similar_sign_counts.items(), key=lambda x: x[1], reverse=True)
print("First 10 pairs of companies with the most common same-sign numbers:") for pair, count in sorted_pairs[:10]: print(f"Pair: {pair}, Common Same-Sign Count: {count}")
This modified code will display the first 10 pairs of companies with the most common same-sign numbers across all components, along with the count of times they share the same sign.
i want to see the first 30 pairs but only print the odd numbers ChatGPT If you want to display the first 30 pairs of companies with the most common same-sign numbers but only print the odd-numbered pairs, you can modify the code as follows: python Copy code import pandas as pd
similar_sign_counts = {}
for component_number in range(n_components):
# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]
# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index
# Update the dictionary with positive correlations
for i, company1 in enumerate(positive_correlations):
for j, company2 in enumerate(positive_correlations):
if i != j:
pair = (company1, company2)
if pair not in similar_sign_counts:
similar_sign_counts[pair] = 0
similar_sign_counts[pair] += 1
# Update the dictionary with negative correlations
for i, company1 in enumerate(negative_correlations):
for j, company2 in enumerate(negative_correlations):
if i != j:
pair = (company1, company2)
if pair not in similar_sign_counts:
similar_sign_counts[pair] = 0
similar_sign_counts[pair] += 1
sorted_pairs = sorted(similar_sign_counts.items(), key=lambda x: x[1], reverse=True)
print("First 30 odd-numbered pairs of companies with the most common same-sign numbers:") for i, (pair, count) in enumerate(sorted_pairs[:60], start=1): if i % 2 != 0: print(f"Pair: {pair}, Common Same-Sign Count: {count}")
This modified code will display the first 30 odd-numbered pairs of companies with the most common same-sign numbers across all components, along with the count of times they share the same sign.
i want to reduce the dimensions, i.e number of components? how can i pick the ideal number of components? ChatGPT To reduce the dimensionality in a PCA analysis and choose the ideal number of components, you can use the concept of explained variance. The goal is to retain a sufficient amount of variance while reducing the number of components. Here are the steps to pick the ideal number of components: Calculate Explained Variance: After performing PCA, you have a list of explained variance for each component. This list tells you how much variance in the original data each component captures. Plot Explained Variance: Create a scree plot or cumulative explained variance plot. In the scree plot, you'll see the explained variance for each component. In the cumulative explained variance plot, you'll see the cumulative variance explained as you include more components. Select a Threshold: Determine the threshold for the amount of variance you want to retain. For example, you might decide to retain 95% of the total variance. Choose the Number of Components: Count the number of components required to exceed the selected threshold in the cumulative explained variance plot. This is your ideal number of components. Here's an example of how to do this in Python using the scikit-learn library: python Copy code import pandas as pd from sklearn.decomposition import PCA import matplotlib.pyplot as plt
pca = PCA() pca.fit(data)
explained_variance = pca.explained_varianceratio cumulative_explained_variance = explained_variance.cumsum()
plt.figure(figsize=(10, 5)) plt.bar(range(1, len(explained_variance) + 1), explained_variance, label="Explained Variance") plt.step(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, where='mid', label="Cumulative Explained Variance") plt.xlabel('Number of Components') plt.ylabel('Variance Explained') plt.legend() plt.grid() plt.show()
threshold = 0.95 n_components = sum(cumulative_explained_variance <= threshold) + 1 print(f"Ideal number of components to retain {threshold}% of variance: {n_components}")
In the code above, you'll perform PCA, plot the explained variance and cumulative explained variance, and choose the number of components that allows you to retain the desired amount of variance (in this case, 95%). Adjust the threshold to meet your specific requirements for dimensionality reduction.
i already applied PCA, i had 60 components, first 13 covers the 95% of the variance. what should i do with this result? should i redo the pca again with selecting the number of components as 13? ChatGPT If you have already applied PCA and found that the first 13 components cover 95% of the variance in your data, you have a choice to make based on your goals and requirements: Retain the First 13 Components: If the first 13 components capture 95% of the variance and provide meaningful information for your analysis, you can choose to retain only these 13 components. Redoing PCA with fewer components can help reduce the dimensionality of your data, making it computationally more efficient and easier to interpret. Reapply PCA with Fewer Components: If you want to apply PCA with only 13 components, you can reapply PCA with the specific number of components (13) rather than the original 60. This can be beneficial if you prefer working with a reduced dataset or need to perform further analyses with the reduced dimensionality. The decision between these two options depends on your specific use case and whether retaining the first 13 components provides adequate information for your analysis. Reducing the number of components can simplify your data, but it's important to ensure that the reduced dataset still captures the essential patterns and relationships in your data. If you choose to retain the first 13 components, you can simply use those components in your subsequent analysis without reapplying PCA.
"# Select all columns except the first column (timestamps) columns_to_analyze = allticks_data.columns[1:] # Calculate the variance and standard deviation for each selected column variances = allticks_data[columns_to_analyze].var() std_deviations = allticks_data[columns_to_analyze].std() # Create a new DataFrame to store the results statistics_df = pd.DataFrame({ 'Column': columns_to_analyze, 'Variance': variances, 'Standard Deviation': std_deviations }) # Display the calculated statistics print(statistics_df) statistics_df.to_excel('stats.xlsx',index=False)" can i also add coefficient of variance to this code ChatGPT Certainly, you can calculate the coefficient of variation (CV) and add it to your statistics DataFrame. The coefficient of variation is a measure of relative variability and is calculated as the ratio of the standard deviation to the mean. Here's how you can modify your code to include the coefficient of variation: python Copy code import pandas as pd
columns_to_analyze = allticks_data.columns[1:]
variances = allticks_data[columns_to_analyze].var() std_deviations = allticks_data[columns_to_analyze].std() means = allticks_data[columns_to_analyze].mean() coeff_of_variation = (std_deviations / means) * 100 # Multiply by 100 to express as a percentage
statistics_df = pd.DataFrame({ 'Column': columns_to_analyze, 'Variance': variances, 'Standard Deviation': std_deviations, 'Coefficient of Variation': coeff_of_variation })
print(statistics_df)
statistics_df.to_excel('stats.xlsx', index=False)
In this modified code, we calculate the coefficient of variation by dividing the standard deviation by the mean and then multiplying by 100 to express it as a percentage. The resulting statistics DataFrame now includes the coefficient of variation along with variance and standard deviation. Additionally, it saves the statistics to an Excel file as you originally requested.